522 files changed, 18346 insertions, 5353 deletions
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d8d18693efc..3e81444ea3d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -164,6 +164,10 @@ README* @ceph/doc-writers /src/cls/rgw_gc @ceph/rgw /src/cls/user @ceph/rgw /src/cls/version @ceph/rgw +/src/mrgw.sh @ceph/rgw +/src/mrun @ceph/rgw +/src/mstart.sh @ceph/rgw +/src/mstop.sh @ceph/rgw /src/rgw @ceph/rgw /src/s3select @ceph/rgw /src/spawn @ceph/rgw diff --git a/.github/labeler.yml b/.github/labeler.yml index 1b50ff7c5a3..cc32be38501 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -263,6 +263,19 @@ rbd: - systemd/rbdmap.service.in - udev/50-rbd.rules +nvmeof: + - qa/suites/nvmeof/** + - qa/tasks/nvmeof.py + - qa/workunits/nvmeof/** + - src/ceph_nvmeof_monitor_client.cc + - src/cephadm/cephadmlib/daemons/nvmeof.py + - src/messages/MNVMeofGw* + - src/mon/NVMeofGw* + - src/nvmeof/** + - src/pybind/mgr/cephadm/services/nvmeof.py + - src/pybind/mgr/cephadm/templates/services/nvmeof/** + - src/tools/ceph-dencoder/nvmeof* + rgw: - qa/suites/rgw/** - qa/tasks/rgw* @@ -275,6 +288,9 @@ rgw: - src/cls/rgw_gc/** - src/cls/timeindex/** - src/mrgw.sh + - src/mrun + - src/mstart.sh + - src/mstop.sh - src/rgw/** - src/test/cls_rgw/** - src/test/librgw_* diff --git a/.github/workflows/check-license.yml b/.github/workflows/check-license.yml new file mode 100644 index 00000000000..89dcfa292c3 --- /dev/null +++ b/.github/workflows/check-license.yml @@ -0,0 +1,14 @@ +--- +name: "Check for Incompatible Licenses" +on: [pull_request] + +jobs: + pull_request: + name: "Check for Incompatible Licenses" + runs-on: ubuntu-latest + steps: + - name: Check Pull Request + uses: JJ/github-pr-contains-action@526dfe784d8604ea1c39b6c26609074de95b1ffd # releases/v14.1 + with: + github-token: ${{github.token}} + diffDoesNotContain: "GNU General Public License" diff --git a/.githubmap b/.githubmap index b93132cf1ee..68c711aa587 100644 --- a/.githubmap +++ b/.githubmap @@ -12,6 +12,7 @@ aaSharma14 Aashish Sharma <aasharma@redhat.com> aclamk Adam Kupczyk <akupczyk@redhat.com> adamemerson Adam C. 
Emerson <aemerson@redhat.com> adk3798 Adam King <adking@redhat.com> +afreen23 Afreen Misbah <afreen@ibm.com> ajarr Ramana Raja <rraja@redhat.com> alfonsomthd Alfonso Martínez <almartin@redhat.com> alfredodeza Alfredo Deza <adeza@redhat.com> @@ -19,6 +20,7 @@ alimaredia Ali Maredia <amaredia@redhat.com> amathuria Aishwarya Mathuria <amathuri@redhat.com> amitkumar50 Amit Kumar <amitkuma@redhat.com> andrewschoen Andrew Schoen <aschoen@redhat.com> +anuradhagadge Anuradha Gadge <Anuradha.Gadge@ibm.com> aaryanporwal Aaryan Porwal <aaryanporwal2233@gmail.com> asettle Alexandra Settle <asettle@suse.com> athanatos Samuel Just <sjust@redhat.com> @@ -27,7 +29,7 @@ b-ranto Boris Ranto <branto@redhat.com> badone Brad Hubbard <bhubbard@redhat.com> baruza Barbora Ančincová <bara@redhat.com> bassamtabbara Bassam Tabbara <bassam.tabbara@quantum.com> -batrick Patrick Donnelly <pdonnell@redhat.com> +batrick Patrick Donnelly <pdonnell@ibm.com> bigjust Justin Caratzas <jcaratza@redhat.com> bk201 Kiefer Chang <kiefer.chang@suse.com> BlaineEXE Blaine Gardner <bgardner@suse.com> @@ -47,6 +49,7 @@ Devp00l Stephan Müller <smueller@suse.com> dillaman Jason Dillaman <dillaman@redhat.com> djgalloway David Galloway <dgallowa@redhat.com> dmick Dan Mick <dmick@redhat.com> +dnyanee1997 Dnyaneshwari talwekar <dtalweka@redhat.com> dragonylffly Li Wang <laurence.liwang@gmail.com> dsavineau Dimitri Savineau <dsavinea@redhat.com> dvanders Dan van der Ster <dan.vanderster@clyso.com> @@ -96,6 +99,7 @@ mikechristie Mike Christie <mchristi@redhat.com> mogeb Mohamad Gebai <mgebai@suse.com> MrFreezeex Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@cern.ch> myoungwon Myoungwon Oh <myoungwon.oh@samsung.com> +nmunet Naman Munet <nmunet@redhat.com> Naveenaidu Naveen Naidu <naveen.naidu@ibm.com> neha-ojha Neha Ojha <nojha@redhat.com> NitzanMordhai Nitzan Mordechai <nmordech@redhat.com> @@ -109,6 +113,8 @@ p-se Patrick Seidensal <pseidensal@suse.com> pcuzner Paul Cuzner <pcuzner@redhat.com> Pegonzal Pedro Gonzalez Gomez <pegonzal@redhat.com> pereman2 Pere Diaz Bou <pdiazbou@redhat.com> +prgoel-code Prachi prgoel@redhat.com +pujaoshahu Puja Shahu <pshahu@redhat.com> rchagam Anjaneya Chagam <anjaneya.chagam@intel.com> renhwztetecs huanwen ren <ren.huanwen@zte.com.cn> ricardoasmarques Ricardo Marques <rimarques@suse.com> @@ -123,6 +129,8 @@ Sarthak0702 Sarthak Gupta <sarthak.dev.0702@gmail.com> saschagrunert Sascha Grunert <sgrunert@suse.com> sebastian-philipp Sebastian Wagner <sewagner@redhat.com> shraddhaag Shraddha Agrawal <shraddhaag@ibm.com> +Kushal-deb Kushal Deb <Kushal.Deb@ibm.com> +ShwetaBhosale1 Shweta Bhosale <Shweta.Bhosale1@ibm.com> ShyamsundarR Shyamsundar R <srangana@redhat.com> sidharthanup Sidharth Anupkrishnan <sanupkri@redhat.com> smithfarm Nathan Cutler <ncutler@suse.com> @@ -179,3 +187,4 @@ zmc Zack Cerza <zack@redhat.com> robbat2 Robin H. Johnson <robbat2@orbis-terrarum.net> leonid-s-usov Leonid Usov <leonid.usov@ibm.com> ffilz Frank S. 
Filz <ffilzlnx@mindspring.com> +Jayaprakash-ibm Jaya Prakash Madaka <jayaprakash@ibm.com> diff --git a/.gitmodules b/.gitmodules index eb51f2ef74a..4a20b958b56 100644 --- a/.gitmodules +++ b/.gitmodules @@ -82,4 +82,3 @@ path = src/nvmeof/gateway url = https://github.com/ceph/ceph-nvmeof.git fetchRecurseSubmodules = false - shallow = true @@ -24,6 +24,7 @@ Adam Kupczyk <akupczyk@redhat.com> <aclamk@gmail.com> Adam Kupczyk <akupczyk@redhat.com> <akucpzyk@redhat.com> Adam Twardowski <adam.twardowski@gmail.com> Adir Lev <adirl@mellanox.com> +Afreen Misbah <afreen@ibm.com> Ahoussi Armand <ahoussi.say@telecom-bretagne.eu> <delco225> Ailing Zhang <zhangal1992@gmail.com> <ailzhang@users.noreply.github.com> Aishwarya Mathuria <amathuri@redhat.com> amathuria <NOT@FOUND> @@ -63,6 +64,7 @@ Anthony D Atri <anthony.datri@gmail.com> <anthony.datri@indexexchange.com> Anthony D Atri <anthony.datri@gmail.com> anthonyeleven <NOT@FOUND> Anton Oks <anton.oks@gmx.de> Anton Turetckii <tyrchenok@gmail.com> banuchka <tyrchenok@gmail.com> +Anuradha Gadge <anuradha.gadge@ibm.com> <anuradhagadge18@gmail.com> Anurag Bandhu <abandhu@redhat.com> <anurag@localhost.localdomain> Aravind Ramesh <Aravind.Ramesh@wdc.com> Aravind <aravind.ramesh@wdc.com> Aristoteles Neto <aristoteles.neto@webdrive.co.nz> <wdneto@users.noreply.github.com> @@ -168,6 +170,7 @@ Dhairya Parmar <dparmar@redhat.com> dparmar18 <dparmar@redhat.com> Dingdang Zhang <boqian.zy@alibaba-inc.com> Dmitry Smirnov <onlyjob@member.fsf.org> <onlyjob@debian.org> Dmitry Yatsushkevich <dyatsushkevich@mirantis.com> <dmitry.yatsushkevich@gmail.com> +Dnyaneshwari talwekar <dtalweka@redhat.com> Dominik Hannen <cantares1+github@gmail.com> <dhxgit@users.noreply.github.com> Dongdong Tao <dongodng.tao@canonical.com> Dongdong Tao <tdd21151186@gmail.com> @@ -508,6 +511,7 @@ Myoungwon Oh <omwmw@sk.com> Myoungwon Oh <omwmw@sk.com> <ommw@sk.com> Na Xie <xie.na@h3c.com> Nag Pavan Chilakam <nagpavan.chilakam@gmail.com> <55574442+nagpavan-chilakam@users.noreply.github.com> +Naman Munet <nmunet@redhat.com> Nancy Su <su_nan@inspur.com> Nathan Cutler <ncutler@suse.com> Nathan Cutler <ncutler@suse.com> <cutler@suse.cz> @@ -544,7 +548,8 @@ Pan Liu <pan.liu@istuary.com> <liupan1111@gmail.com> Parth Arora <paarora@redhat.com> parth-gr <paarora@redhat.com> Pascal de Bruijn <pascal@unilogicnetworks.net> Patience Warnick <patience@cranium.pelton.net> <patiencew@29311d96-e01e-0410-9327-a35deaab8ce9> -Patrick Donnelly <pdonnell@redhat.com> <pdonell@redhat.com> +Patrick Donnelly <pdonnell@ibm.com> <pdonnell@redhat.com> +Patrick Donnelly <pdonnell@ibm.com> <batrick@batbytes.com> Patrick McGarry <patrick@inktank.com> Patrick McGarry <pmcgarry@redhat.com> <pmcgarry@gmail.com> Patrick Seidensal <pseidensal@suse.com> @@ -572,6 +577,8 @@ Pooja Gautam <pooja.gautam@ts.fujitsu.com> Pritha Srivastava <prsrivas@redhat.com> Pritha Srivastava <prsrivas@redhat.com> <pritha@dhcp35-190.lab.eng.blr.redhat.com> Pritha Srivastava <prsrivas@redhat.com> <prsivas@redhat.com> +Prachi prgoel@redhat.com +Puja Shahu <pshahu@redhat.com> Qi Liang Hong <qilianghong@huawei.com> Qiankun Zheng <zheng.qiankun@h3c.com> Qinfei Liu <lucas.liuqinfei@huawei.com> <18138800392@163.com> @@ -674,12 +681,14 @@ Shiqi <m13913886148@gmail.com> <1454927420@qq.com> Shishir Gowda <shishir.gowda@sandisk.com> Shotaro Kawaguchi <kawaguchi.s@jp.fujitsu.com> Shraddha Agrawal <shraddhaag@ibm.com> +Kushal Deb <Kushal.Deb@ibm.com> Shreyansh Sancheti <ssanchet@redhat.com> shreyanshjain7174 <ssanchet@redhat.com> Shu, Xinxin <xinxin.shu@intel.com> 
Shuai Yong <yongshuai@sangfor.com.cn> Shun Song <song.shun3@zte.com.cn> Shun Song <song.shun3@zte.com.cn> <root@clove83.zte.com.cn> Shun Song <song.shun3@zte.com.cn> <songshun134@126.com> +Shweta Bhosale <Shweta.Bhosale1@ibm.com> <bhosaleshweta097@gmail.com> Shyamsundar R <srangana@redhat.com> Shylesh Kumar <shmohan@redhat.com> <shylesh.mohan@gmail.com> Sibei Gao <gaosb@inspur.com> diff --git a/.organizationmap b/.organizationmap index bc194953d1b..e59e6ae24e1 100644 --- a/.organizationmap +++ b/.organizationmap @@ -346,19 +346,28 @@ Huayun <contact@huayun.com> Zheng Yin <zhengyin@huayun.com> Huazhong University of Science and Technology <contact@hust.edu.cn> Luo Runbing <runsisi@hust.edu.cn> HXT Semiconductor <contact@hxt-semitech.org> Jiang Yutang <yutang2.jiang@hxt-semitech.com> IBM <contact@IBM.com> Adam Kupczyk <akupczyk@ibm.com> +IBM <contact@IBM.com> Afreen Misbah <afreen@ibm.com> IBM <contact@IBM.com> Aliaksei Makarau <aliaksei.makarau@ibm.com> IBM <contact@IBM.com> Andrew Solomon <asolomon@us.ibm.com> +IBM <contact@IBM.com> Anuradha Gadge <Anuradha.Gadge@ibm.com> +IBM <contact@IBM.com> Dnyaneshwari talwekar <Dnyaneshwari.Talwekar@ibm.com> IBM <contact@IBM.com> Guillaume Abrioux <gabrioux@ibm.com> IBM <contact@IBM.com> Jonas Pfefferle <jpf@ibm.com> IBM <contact@IBM.com> Laura Flores <lflores@ibm.com> IBM <contact@IBM.com> Martin Ohmacht <mohmacht@us.ibm.com> IBM <contact@IBM.com> Michel Normand <normand@linux.vnet.ibm.com> +IBM <contact@IBM.com> Naman Munet <Naman.Munet@ibm.com> IBM <contact@IBM.com> Naveen Naidu <naveen.naidu@ibm.com> IBM <contact@IBM.com> Neeraj Pratap Singh <Neeraj.Pratap.Singh1@ibm.com> IBM <contact@IBM.com> Or Ozeri <oro@il.ibm.com> IBM <contact@IBM.com> Paul Cuzner <pcuzner@ibm.com> +IBM <contact@IBM.com> Prachi Goel <PRACHI.GOEL2@ibm.com> +IBM <contact@IBM.com> Puja Shahu <puja-shahu.omprakash@ibm.com> IBM <contact@IBM.com> Samuel Matzek <smatzek@us.ibm.com> IBM <contact@IBM.com> Shraddha Agrawal <shraddhaag@ibm.com> +IBM <contact@IBM.com> Kushal Deb <Kushal.Deb@ibm.com> +IBM <contact@IBM.com> Shweta Bhosale <Shweta.Bhosale1@ibm.com> +IBM <contact@IBM.com> Patrick Donnelly <pdonnell@ibm.com> IBM <contact@IBM.com> Sunil Angadi <Sunil.Angadi@ibm.com> IBM <contact@IBM.com> Teoman Onay <tonay@ibm.com> IBM <contact@ibm.com> Ulrich Weigand <ulrich.weigand@de.ibm.com> @@ -582,6 +591,7 @@ Red Hat <contact@redhat.com> Adam King <adking@redhat.com> Red Hat <contact@redhat.com> Adam King <adking@redhat.com> Red Hat <contact@redhat.com> Adam Kupczyk <akupczyk@redhat.com> Red Hat <contact@redhat.com> Ademar de Souza Reis Jr <areis@redhat.com> +Red Hat <contact@redhat.com> Afreen Misbah <afrahman@redhat.com> Red Hat <contact@redhat.com> Aishwarya Mathuria <amathuri@redhat.com> Red Hat <contact@redhat.com> Albin Antony <aantony@redhat.com> Red Hat <contact@redhat.com> Alex Elder <aelder@redhat.com> @@ -618,6 +628,7 @@ Red Hat <contact@redhat.com> Deepika Upadhyay <dupadhya@redhat.com> Red Hat <contact@redhat.com> Dhairya Parmar <dparmar@redhat.com> Red Hat <contact@redhat.com> Dimitri Savineau <dsavinea@redhat.com> Red Hat <contact@redhat.com> Divyansh Kamboj <dkamboj@redhat.com> +Red Hat <contact@redhat.com> Dnyaneshwari talwekar <dtalweka@redhat.com> Red Hat <contact@redhat.com> Douglas Fuller <dfuller@redhat.com> Red Hat <contact@redhat.com> Ernesto Puerta <epuertat@redhat.com> Red Hat <contact@redhat.com> Erwan Velu <erwan@redhat.com> @@ -683,6 +694,7 @@ Red Hat <contact@redhat.com> Mike Hackett <mhackett@redhat.com> Red Hat <contact@redhat.com> Mike Perez 
<miperez@redhat.com> Red Hat <contact@redhat.com> Milan Broz <mbroz@redhat.com> Red Hat <contact@redhat.com> Milind Changire <mchangir@redhat.com> +Red Hat <contact@redhat.com> Naman Munet <nmunet@redhat.com> Red Hat <contact@redhat.com> Nathan Weinberg <nweinber@redhat.com> Red Hat <contact@redhat.com> Neeraj Pratap Singh <neesingh@redhat.com> Red Hat <contact@redhat.com> Neha Ojha <nojha@redhat.com> @@ -706,9 +718,11 @@ Red Hat <contact@redhat.com> Pere Diaz Bou <pdiazbou@redhat.com> Red Hat <contact@redhat.com> Pete Zaitcev <zaitcev@redhat.com> Red Hat <contact@redhat.com> Petr Lautrbach <plautrba@redhat.com> Red Hat <contact@redhat.com> Petr Machata <pmachata@redhat.com> +Red Hat <contact@redhat.com> Prachi prgoel@redhat.com Red Hat <contact@redhat.com> Prasanna Kumar Kalever <prasanna.kalever@redhat.com> Red Hat <contact@redhat.com> Prashant D <pdhange@redhat.com> Red Hat <contact@redhat.com> Pritha Srivastava <prsrivas@redhat.com> +Red Hat <contact@redhat.com> Puja Shahu <pshahu@redhat.com> Red Hat <contact@redhat.com> Radoslaw Zarzynski <rzarzynski@redhat.com> Red Hat <contact@redhat.com> Rafael Quintero <rquinter@redhat.com> Red Hat <contact@redhat.com> Ramakrishnan Periyasamy <rperiyas@redhat.com> diff --git a/.peoplemap b/.peoplemap index 507f50edb43..418e8505fb4 100644 --- a/.peoplemap +++ b/.peoplemap @@ -73,5 +73,5 @@ Yehuda Sadeh <ysadehwe@redhat.com> Yehuda Sadeh <yehuda@inktank.com> Yuri Weinstein <yuriw@redhat.com> Yuri Weinstein <yuri.weinstein@inktank.com> Zhi Zhang <zhangz.david@outlook.com> Zhi (David) Zhang <zhangz@yahoo-inc.com> Zheng Yin <zhengyin@huayun.com> Zheng Yin <zhengyin@chinac.com> -Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com> +Patrick Donnelly <pdonnell@ibm.com> Patrick Donnelly <pdonnell@redhat.com> Patrick Donnelly <batrick@batbytes.com> Myoungwon Oh <myoungwon.oh@samsung.com> Myoungwon Oh <omwmw@sk.com> Myoungwon Oh <ohmyoungwon@gmail.com> diff --git a/CodingStyle b/CodingStyle index 659298f0e5a..019d23c7703 100644 --- a/CodingStyle +++ b/CodingStyle @@ -108,6 +108,12 @@ by section. portability since `#pragma once` is widely supported and is known to work on GCC and Clang. +* Header Files -> Forward declarations: + + Forward declarations of structs, unions, classes and enums can be + used to reduce header dependencies. This speeds up compile times + because the compiler has to process less code. + The following guidelines have not been followed in the legacy code, but are worth mentioning and should be followed strictly for new code: diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 1a4e26e747f..d82ed125d92 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -25,6 +25,17 @@ - osd_op_num_shards_hdd = 1 (was 5) - osd_op_num_threads_per_shard_hdd = 5 (was 1) For more details see https://tracker.ceph.com/issues/66289. +* MGR: MGR's always-on modules/plugins can now be force-disabled. This can be + necessary in cases where MGR(s) needs to be prevented from being flooded by + the module commands when the corresponding Ceph service is down/degraded. + +* CephFS: Modifying the FS setting variable "max_mds" when a cluster is + unhealthy now requires users to pass the confirmation flag + (--yes-i-really-mean-it). This has been added as a precaution to tell the + users that modifying "max_mds" may not help with troubleshooting or recovery + effort. Instead, it might further destabilize the cluster. 
+ + >=19.0.0 diff --git a/README.md b/README.md index e51621ca8b8..56257697e9a 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ See https://ceph.com/ for current information about Ceph. ## Status +[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/2220/badge)](https://www.bestpractices.dev/projects/2220) [![Issue Backporting](https://github.com/ceph/ceph/actions/workflows/create-backport-trackers.yml/badge.svg)](https://github.com/ceph/ceph/actions/workflows/create-backport-trackers.yml) ## Contributing Code diff --git a/SubmittingPatches-backports.rst b/SubmittingPatches-backports.rst index 0f96aec65c4..bb55088cb5f 100644 --- a/SubmittingPatches-backports.rst +++ b/SubmittingPatches-backports.rst @@ -121,14 +121,11 @@ If you do not have sufficient permissions to modify any field of the tracker issue, just add a comment describing what changes you would like to make. Someone with permissions will make the necessary modifications on your behalf. -For straightforward backports, that's all that you (as the developer of the fix) -need to do. Volunteers from the `Stable Releases and Backports team`_ will -proceed to create Backport issues to track the necessary backports and stage the -backports by opening GitHub PRs with the cherry-picks. If you don't want to -wait, and provided you have sufficient permissions at https://tracker.ceph.com, -you can `create Backport tracker issues` and `stage backports`_ yourself. In -that case, read on. - +Authors of pull requests are responsible for creating associated backport pull +requests. As long as you have sufficient permissions at +https://tracker.ceph.com, you can `create Backport tracker issues` and `stage +backports`_ yourself. Read these linked sections to learn how to create +backport tracker issues and how to stage backports: .. _`create backport tracker issues`: .. _`backport tracker issue`: @@ -146,10 +143,7 @@ issues can be created in the backport tracker issue for tracking the backporting Under ordinary circumstances, the developer who merges the ``main`` PR will flag the ``main`` branch tracker issue for backport by changing the Status to "Pending -Backport", and volunteers from the `Stable Releases and Backports team`_ -periodically create backport tracker issues by running the -``backport-create-issue`` script. They also do the actual backporting. But that -does take time and you may not want to wait. +Backport". You might be tempted to forge ahead and create the backport issues yourself. Please don't do that - it is difficult (bordering on impossible) to get all the @@ -360,20 +354,11 @@ Once the backport PR is open, the first order of business is to set the Milestone tag to the stable release the backport PR is targeting. For example, if the PR is targeting "nautilus", set the Milestone tag to "nautilus". -If you don't have sufficient GitHub permissions to set the Milestone, don't -worry. Members of the `Stable Releases and Backports team`_ periodically run -a script (``ceph-backport.sh --milestones``) which scans all PRs targetting stable -branches and automatically adds the correct Milestone tag if it is missing. - Next, check which component label was applied to the ``main`` PR corresponding to this backport, and double-check that that label is applied to the backport PR as well. For example, if the ``main`` PR carries the component label "core", the backport PR should also get that label. 
-In general, it is the responsibility of the `Stable Releases and Backports -team`_ to ensure that backport PRs are properly labelled. If in doubt, just -leave the labelling to them. - .. _`backport PR reviewing`: .. _`backport PR testing`: .. _`backport PR merging`: @@ -381,9 +366,8 @@ leave the labelling to them. Reviewing, testing, and merging of backport PRs ----------------------------------------------- -Once your backport PR is open and the Milestone is set properly, the -`Stable Releases and Backports team` will take care of getting the PR -reviewed and tested. Once the PR is reviewed and tested, it will be merged. +Once your backport PR is open, it will be reviewed and tested. When the PR has +been reviewed and tested, it will be merged. If you would like to facilitate this process, you can solicit reviews and run integration tests on the PR. In this case, add comments to the PR describing the @@ -394,22 +378,3 @@ it will be merged. Even if you have sufficient GitHub permissions to merge the PR, please do *not* merge it yourself. (Uncontrolled merging to stable branches unnecessarily complicates the release preparation process, which is done by volunteers.) - - -Stable Releases and Backports team ----------------------------------- - -Ceph has a `Stable Releases and Backports`_ team, staffed by volunteers, -which is charged with maintaining the stable releases and backporting bugfixes -from the ``main`` branch to them. (That team maintains a wiki, accessible by -clicking the `Stable Releases and Backports`_ link, which describes various -workflows in the backporting lifecycle.) - -.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki - -Ordinarily, it is enough to fill out the "Backport" field in the bug (tracker -issue). The volunteers from the Stable Releases and Backports team will -backport the fix, run regression tests on it, and include it in one or more -future point releases. 
- - diff --git a/cmake/modules/BuildISAL.cmake b/cmake/modules/BuildISAL.cmake new file mode 100644 index 00000000000..6df15bc5bb8 --- /dev/null +++ b/cmake/modules/BuildISAL.cmake @@ -0,0 +1,42 @@ +# use an ExternalProject to build isa-l using its makefile +function(build_isal) + set(isal_BINARY_DIR ${CMAKE_BINARY_DIR}/src/isa-l) + set(isal_INSTALL_DIR ${isal_BINARY_DIR}/install) + set(isal_INCLUDE_DIR "${isal_INSTALL_DIR}/include") + set(isal_LIBRARY "${isal_INSTALL_DIR}/lib/libisal.a") + + # this include directory won't exist until the install step, but the + # imported targets need it early for INTERFACE_INCLUDE_DIRECTORIES + file(MAKE_DIRECTORY "${isal_INCLUDE_DIR}") + + set(configure_cmd env CC=${CMAKE_C_COMPILER} ./configure --prefix=${isal_INSTALL_DIR}) + # build a static library with -fPIC that we can link into crypto/compressor plugins + list(APPEND configure_cmd --with-pic --enable-static --disable-shared) + + # clear the DESTDIR environment variable from debian/rules, + # because it messes with the internal install paths of arrow's bundled deps + set(NO_DESTDIR_COMMAND ${CMAKE_COMMAND} -E env --unset=DESTDIR) + + include(ExternalProject) + ExternalProject_Add(isal_ext + SOURCE_DIR "${PROJECT_SOURCE_DIR}/src/isa-l" + CONFIGURE_COMMAND ./autogen.sh COMMAND ${configure_cmd} + BUILD_COMMAND ${NO_DESTDIR_COMMAND} make -j3 + BUILD_IN_SOURCE 1 + BUILD_BYPRODUCTS ${isal_LIBRARY} + INSTALL_COMMAND ${NO_DESTDIR_COMMAND} make install + UPDATE_COMMAND "" + LOG_CONFIGURE ON + LOG_BUILD ON + LOG_INSTALL ON + LOG_MERGED_STDOUTERR ON + LOG_OUTPUT_ON_FAILURE ON) + + # add imported library target ISAL::Crypto + add_library(ISAL::ISAL STATIC IMPORTED GLOBAL) + add_dependencies(ISAL::ISAL isal_ext) + set_target_properties(ISAL::ISAL PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES ${isal_INCLUDE_DIR} + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION ${isal_LIBRARY}) +endfunction() diff --git a/cmake/modules/BuildISALCrypto.cmake b/cmake/modules/BuildISALCrypto.cmake new file mode 100644 index 00000000000..26fb4a8f9cd --- /dev/null +++ b/cmake/modules/BuildISALCrypto.cmake @@ -0,0 +1,31 @@ +# use an ExternalProject to build isa-l_crypto using its makefile +function(build_isal_crypto) + set(ISAL_CRYPTO_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto) + set(ISAL_CRYPTO_INCLUDE_DIR "${ISAL_CRYPTO_SOURCE_DIR}/include") + set(ISAL_CRYPTO_LIBRARY "${ISAL_CRYPTO_SOURCE_DIR}/bin/isa-l_crypto.a") + + include(FindMake) + find_make("MAKE_EXECUTABLE" "make_cmd") + + include(ExternalProject) + ExternalProject_Add(isal_crypto_ext + SOURCE_DIR ${ISAL_CRYPTO_SOURCE_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND ${make_cmd} -f <SOURCE_DIR>/Makefile.unx + BUILD_IN_SOURCE 1 + BUILD_BYPRODUCTS ${ISAL_CRYPTO_LIBRARY} + INSTALL_COMMAND "" + UPDATE_COMMAND "" + LOG_CONFIGURE ON + LOG_BUILD ON + LOG_MERGED_STDOUTERR ON + LOG_OUTPUT_ON_FAILURE ON) + + # add imported library target ISAL::Crypto + add_library(ISAL::Crypto STATIC IMPORTED GLOBAL) + add_dependencies(ISAL::Crypto isal_crypto_ext) + set_target_properties(ISAL::Crypto PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES ${ISAL_CRYPTO_INCLUDE_DIR} + IMPORTED_LINK_INTERFACE_LANGUAGES "C" + IMPORTED_LOCATION ${ISAL_CRYPTO_LIBRARY}) +endfunction() diff --git a/container/Containerfile b/container/Containerfile new file mode 100644 index 00000000000..2f75c8c6ce6 --- /dev/null +++ b/container/Containerfile @@ -0,0 +1,209 @@ +ARG FROM_IMAGE="quay.io/centos/centos:stream9" +FROM $FROM_IMAGE + +# allow FROM_IMAGE to be visible inside this stage +ARG FROM_IMAGE + +# 
Ceph branch name +ARG CEPH_REF="main" + +# Ceph SHA1 +ARG CEPH_SHA1 + +# Ceph git repo (ceph-ci.git or ceph.git) +ARG CEPH_GIT_REPO + +# (optional) Define the baseurl= for the ganesha.repo +ARG GANESHA_REPO_BASEURL="https://buildlogs.centos.org/centos/\$releasever-stream/storage/\$basearch/nfsganesha-5/" + +# (optional) Set to "crimson" to install crimson packages. +ARG OSD_FLAVOR="default" + +# (optional) Should be 'true' for CI builds (pull from shaman, etc.) +ARG CI_CONTAINER="true" + +RUN /bin/echo -e "\ +FROM_IMAGE: ${FROM_IMAGE}\n\ +CEPH_REF: ${CEPH_REF}\n\ +GANESHA_REPO_BASEURL: ${GANESHA_REPO_BASEURL} \n\ +OSD_FLAVOR: ${OSD_FLAVOR} \n\ +CI_CONTAINER: ${CI_CONTAINER}" + +# Other labels are set automatically by container/build github action +# See: https://github.com/opencontainers/image-spec/blob/main/annotations.md +LABEL org.opencontainers.image.authors="Ceph Release Team <ceph-maintainers@ceph.io>" \ + org.opencontainers.image.documentation="https://docs.ceph.com/" + +LABEL \ +FROM_IMAGE=${FROM_IMAGE} \ +CEPH_REF=${CEPH_REF} \ +CEPH_SHA1=${CEPH_SHA1} \ +CEPH_GIT_REPO=${CEPH_GIT_REPO} \ +GANESHA_REPO_BASEURL=${GANESHA_REPO_BASEURL} \ +OSD_FLAVOR=${OSD_FLAVOR} + + +#=================================================================================================== +# Install ceph and dependencies, and clean up +# IMPORTANT: in official builds, use '--squash' build option to keep image as small as possible +# keeping run steps separate makes local rebuilds quick, but images are big without squash option +#=================================================================================================== + +# Pre-reqs +RUN dnf install -y --setopt=install_weak_deps=False epel-release jq + +# Add NFS-Ganesha repo +RUN \ + echo "[ganesha]" > /etc/yum.repos.d/ganesha.repo && \ + echo "name=ganesha" >> /etc/yum.repos.d/ganesha.repo && \ + echo "baseurl=${GANESHA_REPO_BASEURL}" >> /etc/yum.repos.d/ganesha.repo && \ + echo "gpgcheck=0" >> /etc/yum.repos.d/ganesha.repo && \ + echo "enabled=1" >> /etc/yum.repos.d/ganesha.repo + +# ISCSI repo +RUN set -x && \ + curl -s -L https://shaman.ceph.com/api/repos/tcmu-runner/main/latest/centos/9/repo?arch=$(arch) -o /etc/yum.repos.d/tcmu-runner.repo && \ + case "${CEPH_REF}" in \ + quincy|reef) \ + curl -s -L https://download.ceph.com/ceph-iscsi/3/rpm/el9/ceph-iscsi.repo -o /etc/yum.repos.d/ceph-iscsi.repo ;\ + ;;\ + main|*) \ + curl -s -L https://shaman.ceph.com/api/repos/ceph-iscsi/main/latest/centos/9/repo -o /etc/yum.repos.d/ceph-iscsi.repo ;\ + ;;\ + esac + +# Ceph repo +RUN set -x && \ + rpm --import 'https://download.ceph.com/keys/release.asc' && \ + ARCH=$(arch); if [ "${ARCH}" == "aarch64" ]; then ARCH="arm64"; fi ;\ + IS_RELEASE=0 ;\ + if [[ "${CI_CONTAINER}" == "true" ]] ; then \ + # TODO: this can return different ceph builds (SHA1) for x86 vs. arm runs. is it important to fix? 
+ REPO_URL=$(curl -s "https://shaman.ceph.com/api/search/?project=ceph&distros=centos/9/${ARCH}&flavor=${OSD_FLAVOR}&ref=${CEPH_REF}&sha1=latest" | jq -r .[0].url) ;\ + else \ + IS_RELEASE=1 ;\ + REPO_URL="http://download.ceph.com/rpm-${CEPH_REF}/el9/" ;\ + fi && \ + rpm -Uvh "$REPO_URL/noarch/ceph-release-1-${IS_RELEASE}.el9.noarch.rpm" + +# Copr repos +# scikit for mgr-diskprediction-local +# ref: https://github.com/ceph/ceph-container/pull/1821 +RUN \ + dnf install -y --setopt=install_weak_deps=False dnf-plugins-core && \ + dnf copr enable -y tchaikov/python-scikit-learn + +# Update package mgr +RUN dnf update -y --setopt=install_weak_deps=False + +# Define and install packages +# General +RUN echo "ca-certificates" > packages.txt +# Ceph +# TODO: remove lua-devel and luarocks once they are present in ceph.spec.in +# ref: https://github.com/ceph/ceph/pull/54575#discussion_r1401199635 +RUN echo \ +"ceph-common \ +ceph-exporter \ +ceph-grafana-dashboards \ +ceph-immutable-object-cache \ +ceph-mds \ +ceph-mgr-cephadm \ +ceph-mgr-dashboard \ +ceph-mgr-diskprediction-local \ +ceph-mgr-k8sevents \ +ceph-mgr-rook \ +ceph-mgr \ +ceph-mon \ +ceph-osd \ +ceph-radosgw lua-devel luarocks \ +ceph-volume \ +cephfs-mirror \ +cephfs-top \ +kmod \ +libradosstriper1 \ +rbd-mirror" \ +>> packages.txt + +# Optional crimson package(s) +RUN if [ "${OSD_FLAVOR}" == "crimson" ]; then \ + echo "ceph-crimson-osd" >> packages.txt ; \ +fi + +# Ceph "Recommends" +RUN echo "nvme-cli python3-saml smartmontools" >> packages.txt +# NFS-Ganesha +RUN echo "\ +dbus-daemon \ +nfs-ganesha-ceph \ +nfs-ganesha-rados-grace \ +nfs-ganesha-rados-urls \ +nfs-ganesha-rgw \ +nfs-ganesha \ +rpcbind \ +sssd-client" >> packages.txt + +# ISCSI +RUN echo "ceph-iscsi tcmu-runner python3-rtslib" >> packages.txt + +# Ceph-CSI +# TODO: coordinate with @Madhu-1 to have Ceph-CSI install these itself if unused by ceph +# @adk3798 does cephadm use these? +RUN echo "attr ceph-fuse rbd-nbd" >> packages.txt + +# Rook (only if packages must be in ceph container image) +RUN echo "systemd-udev" >> packages.txt + +# Util packages (should be kept to only utils that are truly very useful) +# 'sgdisk' (from gdisk) is used in docs and scripts for clearing disks (could be a risk? @travisn @guits @ktdreyer ?) +# 'ps' (from procps-ng) and 'hostname' are very valuable for debugging and CI +# TODO: remove sg3_utils once they are moved to ceph.spec.in with libstoragemgmt +# ref: https://github.com/ceph/ceph-container/pull/2013#issuecomment-1248606472 +RUN echo "gdisk hostname procps-ng sg3_utils e2fsprogs lvm2 gcc" >> packages.txt + +# scikit +RUN echo "python3-scikit-learn" >> packages.txt + +# ceph-node-proxy +RUN echo "ceph-node-proxy" >> packages.txt + +RUN echo "=== PACKAGES TO BE INSTALLED ==="; cat packages.txt +RUN echo "=== INSTALLING ===" ; \ +dnf install -y --setopt=install_weak_deps=False --setopt=skip_missing_names_on_install=False --enablerepo=crb $(cat packages.txt) + +# XXX why isn't this done in the ganesha package? +RUN mkdir -p /var/run/ganesha + +# Disable sync with udev since the container can not contact udev +RUN \ + sed -i -e 's/udev_rules = 1/udev_rules = 0/' \ + -e 's/udev_sync = 1/udev_sync = 0/' \ + -e 's/obtain_device_list_from_udev = 1/obtain_device_list_from_udev = 0/' \ + /etc/lvm/lvm.conf && \ + # validate the sed command worked as expected + grep -sqo "udev_sync = 0" /etc/lvm/lvm.conf && \ + grep -sqo "udev_rules = 0" /etc/lvm/lvm.conf && \ + grep -sqo "obtain_device_list_from_udev = 0" /etc/lvm/lvm.conf + +# CLEAN UP! 
+RUN set -x && \ + dnf clean all && \ + rm -rf /var/cache/dnf/* && \ + rm -rf /var/lib/dnf/* && \ + rm -f /var/lib/rpm/__db* && \ + # remove unnecessary files with big impact + rm -rf /etc/selinux /usr/share/{doc,man,selinux} && \ + # don't keep compiled python binaries + find / -xdev \( -name "*.pyc" -o -name "*.pyo" \) -delete + +# Verify that the packages installed haven't been accidentally cleaned, then +# clean the package list and re-clean unnecessary RPM database files +RUN rpm -q $(cat packages.txt) && rm -f /var/lib/rpm/__db* && rm -f *packages.txt + +# +# Set some envs in the container for quickly inspecting details about the build at runtime +ENV CEPH_IS_DEVEL="${CI_CONTAINER}" \ + CEPH_REF="${CEPH_REF}" \ + CEPH_OSD_FLAVOR="${OSD_FLAVOR}" \ + FROM_IMAGE="${FROM_IMAGE}" + diff --git a/container/build.sh b/container/build.sh new file mode 100755 index 00000000000..5edf469d2d2 --- /dev/null +++ b/container/build.sh @@ -0,0 +1,175 @@ +#!/bin/bash -ex +# vim: ts=4 sw=4 expandtab + +# repo auth with write perms must be present (this script does not log into +# CONTAINER_REPO_HOSTNAME and CONTAINER_REPO_ORGANIZATION). +# If NO_PUSH is set, no login is necessary + + +CFILE=${1:-Containerfile} +shift || true + +usage() { + cat << EOF +$0 [containerfile] (defaults to 'Containerfile') +For a CI build (from ceph-ci.git, built and pushed to shaman): +CI_CONTAINER: must be 'true' +FLAVOR (OSD flavor, default or crimson) +BRANCH (of Ceph. <remote>/<ref>) +CEPH_SHA1 (of Ceph) +ARCH (of build host, and resulting container) +CONTAINER_REPO_HOSTNAME (quay.ceph.io, for CI, for instance) +CONTAINER_REPO_ORGANIZATION (ceph-ci, for CI, for instance) +CONTAINER_REPO_USERNAME +CONTAINER_REPO_PASSWORD + +For a release build: (from ceph.git, built and pushed to download.ceph.com) +CI_CONTAINER: must be 'false' +and you must also add +VERSION (for instance, 19.1.0) for tagging the image + +You can avoid the push step (for testing) by setting NO_PUSH to anything +EOF +} + +CI_CONTAINER=${CI_CONTAINER:-false} +FLAVOR=${FLAVOR:-default} +# default: current checked-out branch +BRANCH=${BRANCH:-$(git rev-parse --abbrev-ref HEAD)} +# default: current checked-out branch +CEPH_SHA1=${CEPH_SHA1:-$(git rev-parse HEAD)} +# default: build host arch +ARCH=${ARCH:-$(arch)} +if [[ "${ARCH}" == "aarch64" ]] ; then ARCH=arm64; fi +if [[ ${CI_CONTAINER} == "true" ]] ; then + CONTAINER_REPO_HOSTNAME=${CONTAINER_REPO_HOSTNAME:-quay.ceph.io} + CONTAINER_REPO_ORGANIZATION=${CONTAINER_REPO_ORGANIZATION:-ceph/ceph-${ARCH}} +else + CONTAINER_REPO_HOSTNAME=${CONTAINER_REPO_HOSTNAME:-quay.io} + CONTAINER_REPO_ORGANIZATION=${CONTAINER_REPO_ORGANIZATION:-ceph/ceph} + # default: most-recent annotated tag + VERSION=${VERSION:-$(git describe --abbrev=0)} +fi + +# check for existence of all required variables +: "${CI_CONTAINER:?}" +: "${FLAVOR:?}" +: "${BRANCH:?}" +: "${CEPH_SHA1:?}" +: "${ARCH:?}" +: "${CONTAINER_REPO_HOSTNAME:?}" +: "${CONTAINER_REPO_ORGANIZATION:?}" +: "${CONTAINER_REPO_USERNAME:?}" +: "${CONTAINER_REPO_PASSWORD:?}" +if [[ ${CI_CONTAINER} != "true" ]] ; then ${VERSION:?}; fi + +# check for valid repo auth (if pushing) +ORGURL=${CONTAINER_REPO_HOSTNAME}/${CONTAINER_REPO_ORGANIZATION} +MINIMAL_IMAGE=${ORGURL}/ceph:minimal-test +if [[ ${NO_PUSH} != "true" ]] ; then + podman rmi ${MINIMAL_IMAGE} || true + echo "FROM scratch" | podman build -f - -t ${MINIMAL_IMAGE} + if ! podman push ${MINIMAL_IMAGE} ; then + echo "Not authenticated to ${ORGURL}; need docker/podman login?" 
+ exit 1 + fi + podman rmi ${MINIMAL_IMAGE} | true +fi + +if [[ -z "${CEPH_GIT_REPO}" ]] ; then + if [[ ${CI_CONTAINER} == "true" ]]; then + CEPH_GIT_REPO=https://github.com/ceph/ceph-ci.git + else + CEPH_GIT_REPO=https://github.com/ceph/ceph.git + fi +fi + +# BRANCH will be, say, origin/main. remove <remote>/ +BRANCH=${BRANCH##*/} + +podman build --pull=newer --squash -f $CFILE -t build.sh.output \ + --build-arg FROM_IMAGE=${FROM_IMAGE:-quay.io/centos/centos:stream9} \ + --build-arg CEPH_SHA1=${CEPH_SHA1} \ + --build-arg CEPH_GIT_REPO=${CEPH_GIT_REPO} \ + --build-arg CEPH_REF=${BRANCH:-main} \ + --build-arg OSD_FLAVOR=${FLAVOR:-default} \ + --build-arg CI_CONTAINER=${CI_CONTAINER:-default} \ + 2>&1 + +image_id=$(podman image ls localhost/build.sh.output --format '{{.ID}}') + +# grab useful image attributes for building the tag +# +# the variable settings are prefixed with "export CEPH_CONTAINER_" so that +# an eval or . can be used to put them into the environment +# +# PATH is removed from the output as it would cause problems for this +# parent script and its children +# +# notes: +# +# we want .Architecture and everything in .Config.Env +# +# printf will not accept "\n" (is this a podman bug?) +# so construct vars with two calls to podman inspect, joined by a newline, +# so that vars will get the output of the first command, newline, output +# of the second command +# +vars="$(podman inspect -f '{{printf "export CEPH_CONTAINER_ARCH=%v" .Architecture}}' ${image_id}) +$(podman inspect -f '{{range $index, $value := .Config.Env}}export CEPH_CONTAINER_{{$value}}{{println}}{{end}}' ${image_id})" +vars="$(echo "${vars}" | grep -v PATH)" +eval ${vars} + +# remove everything up to and including the last slash +fromtag=${CEPH_CONTAINER_FROM_IMAGE##*/} +# translate : to - +fromtag=${fromtag/:/-} +builddate=$(date +%Y%m%d) +local_tag=${fromtag}-${CEPH_CONTAINER_CEPH_REF}-${CEPH_CONTAINER_ARCH}-${builddate} + +repopath=${CONTAINER_REPO_HOSTNAME}/${CONTAINER_REPO_ORGANIZATION} + +if [[ ${CI_CONTAINER} == "true" ]] ; then + # ceph-ci conventions for remote tags: + # requires ARCH, BRANCH, CEPH_SHA1, FLAVOR + full_repo_tag=$repopath/ceph:${BRANCH}-${fromtag}-${ARCH}-devel + branch_repo_tag=$repopath/ceph:${BRANCH} + sha1_repo_tag=$repopath/ceph:${CEPH_SHA1} + + if [[ "${ARCH}" == "arm64" ]] ; then + branch_repo_tag=${branch_repo_tag}-arm64 + sha1_repo_tag=${sha1_repo_tag}-arm64 + fi + + podman tag ${image_id} ${full_repo_tag} + podman tag ${image_id} ${branch_repo_tag} + podman tag ${image_id} ${sha1_repo_tag} + + if [[ ${FLAVOR} == "crimson" && ${ARCH} == "x86_64" ]] ; then + sha1_flavor_repo_tag=${sha1_repo_tag}-${FLAVOR} + podman tag ${image_id} ${sha1_flavor_repo_tag} + if [[ -z "${NO_PUSH}" ]] ; then + podman push ${sha1_flavor_repo_tag} + fi + exit + fi + + if [[ -z "${NO_PUSH}" ]] ; then + podman push ${full_repo_tag} + podman push ${branch_repo_tag} + podman push ${sha1_repo_tag} + fi +else + # + # non-CI build. 
Tags are like v19.1.0-20240701 + # push to quay.ceph.io/ceph/prerelease + # + version_tag=${repopath}/prerelease/ceph-${ARCH}:${VERSION}-${builddate} + + podman tag ${image_id} ${version_tag} + if [[ -z "${NO_PUSH}" ]] ; then + podman push ${image_id} ${version_tag} + fi +fi + + diff --git a/container/make-manifest-list.py b/container/make-manifest-list.py new file mode 100755 index 00000000000..010dcaed2b7 --- /dev/null +++ b/container/make-manifest-list.py @@ -0,0 +1,164 @@ +#!/usr/bin/python3 +# +# make a combined "manifest-list" container out of two arch-specific containers +# searches for latest tags on HOST/{AMD,ARM}64_REPO, makes sure they refer +# to the same Ceph SHA1, and creates a manifest-list ("fat") image on +# MANIFEST_HOST/MANIFEST_REPO with the 'standard' set of tags. +# +# uses scratch local manifest LOCALMANIFEST, will be destroyed if present + +from datetime import datetime +import functools +import json +import os +import re +import subprocess +import sys + +# optional env vars (will default if not set) + +OPTIONAL_VARS = ( + 'HOST', + 'AMD64_REPO', + 'ARM64_REPO', + 'MANIFEST_HOST', + 'MANIFEST_REPO', +) + +# Manifest image. Will be destroyed if already present. +LOCALMANIFEST = 'localhost/m' + + +def dump_vars(names, vardict): + for name in names: + print(f'{name}: {vardict[name]}', file=sys.stderr) + + +def run_command(args): + print(f'running {args}', file=sys.stderr) + if not isinstance(args, list): + args = args.split() + try: + result = subprocess.run( + args, + capture_output=True, + text=True, + check=True) + return True, result.stdout, result.stderr + + except subprocess.CalledProcessError as e: + print(f"Command '{e.cmd}' returned {e.returncode}") + print("Error output:") + print(e.stderr) + return False, result.stdout, result.stderr + + +def get_command_output(args): + success, stdout, stderr = run_command(args) + return (stdout if success else None) + + +def run_command_show_failure(args): + success, stdout, stderr = run_command(args) + if not success: + print(f'{args} failed:', file=sys.stderr) + print(f'stdout:\n{stdout}') + print(f'stderr:\n{stderr}') + return success + + +@functools.lru_cache +def get_latest_tag(path): + latest_tag = json.loads( + get_command_output(f'skopeo list-tags docker://{path}') + )['Tags'][-1] + return latest_tag + + +@functools.lru_cache +def get_image_inspect(path): + info = json.loads( + get_command_output(f'skopeo inspect docker://{path}') + ) + return info + + +def get_sha1(info): + return info['Labels']['GIT_COMMIT'] + + +def main(): + host = os.environ.get('HOST', 'quay.io') + amd64_repo = os.environ.get('AMD64_REPO', 'ceph/ceph-amd64') + arm64_repo = os.environ.get('ARM64_REPO', 'ceph/ceph-arm64') + manifest_host = os.environ.get('MANIFEST_HOST', host) + manifest_repo = os.environ.get('MANIFEST_REPO', 'ceph/ceph') + dump_vars( + ('host', + 'amd64_repo', + 'arm64_repo', + 'manifest_host', + 'manifest_repo', + ), + locals()) + + repopaths = ( + f'{host}/{amd64_repo}', + f'{host}/{arm64_repo}', + ) + tags = [get_latest_tag(p) for p in repopaths] + print(f'latest tags: amd64:{tags[0]} arm64:{tags[1]}') + + # check that version of latest tag matches + version_re = \ + r'v(?P<major>\d+)\.(?P<minor>\d+)\.(?P<micro>\d+)-(?P<date>\d+)' + versions = list() + for tag in tags: + mo = re.match(version_re, tag) + ver = f'{mo.group("major")}.{mo.group("minor")}.{mo.group("micro")}' + versions.append(ver) + if versions[0] != versions[1]: + print( + f'version mismatch: amd64:{versions[0]} arm64:{versions[1]}', + file=sys.stderr, + ) + 
return(1) + + major, minor, micro = mo.group(1), mo.group(2), mo.group(3) + print(f'Ceph version: {major}.{minor}.{micro}', file=sys.stderr) + + # check that ceph sha1 of two arch images matches + paths_with_tags = [f'{p}:{t}' for (p, t) in zip(repopaths, tags)] + info = [get_image_inspect(p) for p in paths_with_tags] + sha1s = [get_sha1(i) for i in info] + if sha1s[0] != sha1s[1]: + print( + f'sha1 mismatch: amd64: {sha1s[0]} arm64: {sha1s[1]}', + file=sys.stderr, + ) + builddate = [i['Created'] for i in info] + print( + f'Build dates: amd64: {builddate[0]} arm64: {builddate[1]}', + file=sys.stderr, + ) + return(1) + + # create manifest list image with the standard list of tags + # ignore failure on manifest rm + run_command(f'podman manifest rm localhost/m') + run_command_show_failure(f'podman manifest create localhost/m') + for p in paths_with_tags: + run_command_show_failure(f'podman manifest add m {p}') + base = f'{manifest_host}/{manifest_repo}' + for t in ( + f'v{major}', + f'v{major}.{minor}', + f'v{major}.{minor}.{micro}', + f'v{major}.{minor}.{micro}-{datetime.today().strftime("%Y%m%d")}', + ): + run_command_show_failure( + f'podman manifest push localhost/m {base}:{t}') + + +if (__name__ == '__main__'): + sys.exit(main()) diff --git a/doc/_ext/ceph_releases.py b/doc/_ext/ceph_releases.py index 94e92ffdd6a..481c2a1b619 100644 --- a/doc/_ext/ceph_releases.py +++ b/doc/_ext/ceph_releases.py @@ -191,7 +191,7 @@ class ReleasesGantt(Directive): class CephTimeline(Directive): has_content = False - required_arguments = 3 + required_arguments = 4 optional_arguments = 0 option_spec = {} diff --git a/doc/cephadm/operations.rst b/doc/cephadm/operations.rst index 3b117c1bd6a..420ee655ac8 100644 --- a/doc/cephadm/operations.rst +++ b/doc/cephadm/operations.rst @@ -734,3 +734,72 @@ Purge ceph daemons from all hosts in the cluster # For each host: cephadm rm-cluster --force --zap-osds --fsid <fsid> + + +Replacing a device +================== + +The ``ceph orch device replace`` command automates the process of replacing the underlying device of an OSD. +Previously, this process required manual intervention at various stages. +With this new command, all necessary operations are performed automatically, streamlining the replacement process +and improving the overall user experience. + +.. note:: This only supports LVM-based deployed OSD(s) + +.. prompt:: bash # + + ceph orch device replace <host> <device-path> + +In the case the device being replaced is shared by multiple OSDs (eg: DB/WAL device shared by multiple OSDs), the orchestrator will warn you. + +.. prompt:: bash # + + [ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd + + Error EINVAL: /dev/vdd is a shared device. + Replacing /dev/vdd implies destroying OSD(s): ['0', '1']. + Please, *be very careful*, this can be a very dangerous operation. + If you know what you are doing, pass --yes-i-really-mean-it + +If you know what you are doing, you can go ahead and pass ``--yes-i-really-mean-it``. + +.. prompt:: bash # + + [ceph: root@ceph /]# ceph orch device replace osd-1 /dev/vdd --yes-i-really-mean-it + Scheduled to destroy osds: ['6', '7', '8'] and mark /dev/vdd as being replaced. + +``cephadm`` will make ``ceph-volume`` zap and destroy all related devices and mark the corresponding OSD as ``destroyed`` so the +different OSD(s) ID(s) will be preserved: + +.. 
prompt:: bash # + + [ceph: root@ceph-1 /]# ceph osd tree + ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF + -1 0.97659 root default + -3 0.97659 host devel-1 + 0 hdd 0.29300 osd.0 destroyed 1.00000 1.00000 + 1 hdd 0.29300 osd.1 destroyed 1.00000 1.00000 + 2 hdd 0.19530 osd.2 up 1.00000 1.00000 + 3 hdd 0.19530 osd.3 up 1.00000 1.00000 + +The device being replaced is finally seen as ``being replaced`` preventing ``cephadm`` from redeploying the OSDs too fast: + +.. prompt:: bash # + + [ceph: root@ceph-1 /]# ceph orch device ls + HOST PATH TYPE DEVICE ID SIZE AVAILABLE REFRESHED REJECT REASONS + osd-1 /dev/vdb hdd 200G Yes 13s ago + osd-1 /dev/vdc hdd 200G Yes 13s ago + osd-1 /dev/vdd hdd 200G Yes 13s ago Is being replaced + osd-1 /dev/vde hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected + osd-1 /dev/vdf hdd 200G No 13s ago Has a FileSystem, Insufficient space (<10 extents) on vgs, LVM detected + +If for any reason you need to clear the 'device replace header' on a device, then you can use ``ceph orch device replace <host> <device> --clear``: + +.. prompt:: bash # + + [ceph: root@devel-1 /]# ceph orch device replace devel-1 /dev/vdk --clear + Replacement header cleared on /dev/vdk + [ceph: root@devel-1 /]# + +After that, ``cephadm`` will redeploy the OSD service spec within a few minutes (unless the service is set to ``unmanaged``). diff --git a/doc/cephadm/services/mgmt-gateway.rst b/doc/cephadm/services/mgmt-gateway.rst index 60129b28621..2b88d55952e 100644 --- a/doc/cephadm/services/mgmt-gateway.rst +++ b/doc/cephadm/services/mgmt-gateway.rst @@ -49,6 +49,55 @@ monitoring `mgmt-gateway` takes care of handling HA when several instances of Pr available. The reverse proxy will automatically detect healthy instances and use them to process user requests. +High Availability for mgmt-gateway service +========================================== + +In addition to providing high availability for the underlying backend services, the mgmt-gateway +service itself can be configured for high availability, ensuring that the system remains resilient +even if certain core components for the service fail. + +Multiple mgmt-gateway instances can be deployed in an active/standby configuration using keepalived +for seamless failover. The `oauth2-proxy` service can be deployed as multiple stateless instances, +with nginx acting as a load balancer across them using round-robin strategy. This setup removes +single points of failure and enhances the resilience of the entire system. + +In this setup, the underlying internal services follow the same high availability mechanism. Instead of +directly accessing the `mgmt-gateway` internal endpoint, services use the virtual IP specified in the spec. +This ensures that the high availability mechanism for `mgmt-gateway` is transparent to other services. + +Example Configuration for High Availability + +To deploy the mgmt-gateway in a high availability setup, here is an example of the specification files required: + +`mgmt-gateway` Configuration: + +.. code-block:: yaml + + service_type: mgmt-gateway + placement: + label: mgmt + spec: + enable_auth: true + virtual_ip: 192.168.100.220 + +`Ingress` Configuration for Keepalived: + +.. code-block:: yaml + + service_type: ingress + service_id: ingress-mgmt-gw + placement: + label: mgmt + virtual_ip: 192.168.100.220 + backend_service: mgmt-gateway + keepalive_only: true + +The number of deployed instances is determined by the number of hosts with the mgmt label. 
+The ingress is configured in `keepalive_only` mode, with labels ensuring that any changes to +the mgmt-gateway daemons are replicated to the corresponding keepalived instances. Additionally, +the `virtual_ip` parameter must be identical in both specifications. + + Accessing services with mgmt-gateway ==================================== @@ -123,9 +172,6 @@ The specification can then be applied by running the following command: Limitations =========== -A non-exhaustive list of important limitations for the mgmt-gateway service follows: - -* High-availability configurations and clustering for the mgmt-gateway service itself are currently not supported. * Services must bind to the appropriate ports based on the applications being proxied. Ensure that there are no port conflicts that might disrupt service availability. diff --git a/doc/cephadm/services/oauth2-proxy.rst b/doc/cephadm/services/oauth2-proxy.rst index d1afb515ca2..a941b11e555 100644 --- a/doc/cephadm/services/oauth2-proxy.rst +++ b/doc/cephadm/services/oauth2-proxy.rst @@ -42,8 +42,10 @@ a secure and flexible authentication mechanism. High availability ============================== -`oauth2-proxy` is designed to integrate with an external IDP hence login high availability is not the responsibility of this -service. In squid release high availability for the service itself is not supported yet. +In general, `oauth2-proxy` is used in conjunction with the `mgmt-gateway`. The `oauth2-proxy` service can be deployed as multiple +stateless instances, with the `mgmt-gateway` (nginx reverse-proxy) handling load balancing across these instances using a round-robin strategy. +Since oauth2-proxy integrates with an external identity provider (IDP), ensuring high availability for login is managed externally +and not the responsibility of this service. Accessing services with oauth2-proxy @@ -70,8 +72,7 @@ An `oauth2-proxy` service can be applied using a specification. An example in YA service_type: oauth2-proxy service_id: auth-proxy placement: - hosts: - - ceph0 + label: mgmt spec: https_address: "0.0.0.0:4180" provider_display_name: "My OIDC Provider" diff --git a/doc/cephadm/services/smb.rst b/doc/cephadm/services/smb.rst index abd3f4343f0..cc36a61b9d5 100644 --- a/doc/cephadm/services/smb.rst +++ b/doc/cephadm/services/smb.rst @@ -4,8 +4,6 @@ SMB Service =========== -.. note:: Only the SMB3 protocol is supported. - .. warning:: SMB support is under active development and many features may be @@ -26,7 +24,7 @@ Samba Containers with the following command: .. prompt:: bash # - orch apply smb <cluster_id> <config_uri> [--features ...] [--placement ...] ... + ceph orch apply smb <cluster_id> <config_uri> [--features ...] [--placement ...] ... There are a number of additional parameters that the command accepts. See the Service Specification for a description of these options. diff --git a/doc/cephfs/administration.rst b/doc/cephfs/administration.rst index 5760e67f73e..07646bff067 100644 --- a/doc/cephfs/administration.rst +++ b/doc/cephfs/administration.rst @@ -61,10 +61,17 @@ is a subset of the same information from the ``ceph fs dump`` command. :: - ceph fs set <file system name> <var> <val> + ceph fs set <file system name> <var> <val> [--yes-i-really-mean-it] Change a setting on a file system. These settings are specific to the named -file system and do not affect other file systems. +file system and do not affect other file systems. Confirmation flag is only +needed for changing ``max_mds`` when cluster is unhealthy. + +.. 
note:: It is mandatory to pass the confirmation flag (--yes-i-really-mean-it) + for modifying FS setting variable ``max_mds`` when the cluster is unhealthy. + It has been added as a precaution to tell users that modifying ``max_mds`` + during troubleshooting or recovery might not help. Instead, it might + further destabilize the cluster. :: diff --git a/doc/cephfs/cephfs-journal-tool.rst b/doc/cephfs/cephfs-journal-tool.rst index 4ad7304481f..3ae1139ceac 100644 --- a/doc/cephfs/cephfs-journal-tool.rst +++ b/doc/cephfs/cephfs-journal-tool.rst @@ -105,12 +105,12 @@ Example: header get/set "write_pos": 4274947, "expire_pos": 4194304, "trimmed_pos": 4194303, + "stream_format": 1, "layout": { "stripe_unit": 4194304, - "stripe_count": 4194304, + "stripe_count": 1, "object_size": 4194304, - "cas_hash": 4194304, - "object_stripe_unit": 4194304, - "pg_pool": 4194304}} + "pool_id": 2, + "pool_ns": ""}} # cephfs-journal-tool header set trimmed_pos 4194303 Updating trimmed_pos 0x400000 -> 0x3fffff diff --git a/doc/cephfs/cephfs-mirroring.rst b/doc/cephfs/cephfs-mirroring.rst index f54050514ed..8bdfefa1268 100644 --- a/doc/cephfs/cephfs-mirroring.rst +++ b/doc/cephfs/cephfs-mirroring.rst @@ -120,7 +120,9 @@ system, run a command of the following form: .. note:: "Mirroring module" commands are prefixed with ``fs snapshot mirror``. This distinguishes them from "monitor commands", which are prefixed with ``fs - mirror``. Be sure (in this context) to use module commands. + mirror``. Enabling mirroring by using monitor commands will result in the mirror daemon + entering the "failed" state due to the absence of the `cephfs_mirror` index object. + So be sure (in this context) to use module commands. To disable mirroring for a given file system, run a command of the following form: @@ -189,6 +191,12 @@ To configure a directory for mirroring, run a command of the following form: ceph fs snapshot mirror add <fs_name> <path> +To list the configured directories, run a command of the following form: + +.. prompt:: bash $ + + ceph fs snapshot mirror ls <fs_name> + To stop mirroring directory snapshots, run a command of the following form: .. 
prompt:: bash $ @@ -340,7 +348,7 @@ command is of format `filesystem-name@filesystem-id peer-uuid`:: "last_synced_snap": { "id": 120, "name": "snap1", - "sync_duration": 0.079997898999999997, + "sync_duration": 3, "sync_time_stamp": "274900.558797s", "sync_bytes": 52428800 }, @@ -374,7 +382,7 @@ When a directory is currently being synchronized, the mirror daemon marks it as "last_synced_snap": { "id": 120, "name": "snap1", - "sync_duration": 0.079997898999999997, + "sync_duration": 3, "sync_time_stamp": "274900.558797s", "sync_bytes": 52428800 }, @@ -403,7 +411,7 @@ E.g., adding a regular file for synchronization would result in failed status:: "last_synced_snap": { "id": 121, "name": "snap2", - "sync_duration": 300, + "sync_duration": 5, "sync_time_stamp": "500900.600797s", "sync_bytes": 78643200 }, @@ -439,7 +447,7 @@ In the remote filesystem:: "last_synced_snap": { "id": 120, "name": "snap1", - "sync_duration": 0.079997898999999997, + "sync_duration": 3, "sync_time_stamp": "274900.558797s" }, "snaps_synced": 2, @@ -513,16 +521,16 @@ CephFS exports mirroring metrics as :ref:`Labeled Perf Counters` which will be c - The total number of snapshots renamed * - avg_sync_time - Gauge - - The average time (ms) taken by all snapshot synchronizations + - The average time taken by all snapshot synchronizations * - last_synced_start - Gauge - - The sync start time (ms) of the last synced snapshot + - The sync start time of the last synced snapshot * - last_synced_end - Gauge - - The sync end time (ms) of the last synced snapshot + - The sync end time of the last synced snapshot * - last_synced_duration - Gauge - - The time duration (ms) of the last synchronization + - The time duration of the last synchronization * - last_synced_bytes - counter - The total bytes being synchronized for the last synced snapshot diff --git a/doc/cephfs/client-auth.rst b/doc/cephfs/client-auth.rst index 0fe833441d3..61305e42212 100644 --- a/doc/cephfs/client-auth.rst +++ b/doc/cephfs/client-auth.rst @@ -106,6 +106,8 @@ If quotas are not enabled or if no quota is set on the mounted sub-directory, then the overall usage of the file system will be reported irrespective of the value of this setting. +.. _cephfs-layout-and-quota-restriction: + Layout and Quota restriction (the 'p' flag) =========================================== diff --git a/doc/cephfs/file-layouts.rst b/doc/cephfs/file-layouts.rst index 3bb6ddeae9b..306bbc6eb08 100644 --- a/doc/cephfs/file-layouts.rst +++ b/doc/cephfs/file-layouts.rst @@ -6,6 +6,9 @@ File layouts The layout of a file controls how its contents are mapped to Ceph RADOS objects. You can read and write a file's layout using *virtual extended attributes* or xattrs. +Clients must use the ``p`` flag when writing a file's layout. See :ref:`Layout +and Quota restriction (the 'p' flag) <cephfs-layout-and-quota-restriction>`. + The name of the layout xattrs depends on whether a file is a regular file or a directory. Regular files' layout xattrs are called ``ceph.file.layout``, whereas directories' layout xattrs are called ``ceph.dir.layout``. 
Where subsequent examples refer to ``ceph.file.layout``, substitute ``dir`` as appropriate diff --git a/doc/cephfs/fs-volumes.rst b/doc/cephfs/fs-volumes.rst index 4a5b5232738..3e7dd7815ad 100644 --- a/doc/cephfs/fs-volumes.rst +++ b/doc/cephfs/fs-volumes.rst @@ -14,12 +14,12 @@ abstractions: * FS volumes, an abstraction for CephFS file systems -* FS subvolumes, an abstraction for independent CephFS directory trees - * FS subvolume groups, an abstraction for a directory level higher than FS subvolumes. Used to effect policies (e.g., :doc:`/cephfs/file-layouts`) across a set of subvolumes +* FS subvolumes, an abstraction for independent CephFS directory trees + Possible use-cases for the export abstractions: * FS subvolumes used as Manila shares or CSI volumes @@ -1419,5 +1419,28 @@ set with this id was present in the database $ ceph fs quiesce fs1 sub1 sub2 sub3 --set-id="external-id" --if-version=0 + +.. _disabling-volumes-plugin: + +Disabling Volumes Plugin +------------------------ +By default the volumes plugin is enabled and set to ``always on``. However, in +certain cases it might be appropriate to disable it. For example, when a CephFS +is in a degraded state, the volumes plugin commands may accumulate in MGR +instead of getting served, which eventually causes policy throttles to kick in +and makes the MGR unresponsive. + +In this event, the volumes plugin can be disabled even though it is an +``always on`` module in MGR. To do so, run ``ceph mgr module disable volumes +--yes-i-really-mean-it``. Do note that this command will disable operations +and remove commands of volumes plugin since it will disable all CephFS +services on the Ceph cluster accessed through this plugin. + +Before resorting to a measure as drastic as this, it is a good idea to try less +drastic measures and then assess if the file system experience has improved due +to it. One example of such a less drastic measure is to disable asynchronous +threads launched by the volumes plugin for cloning and purging trash. + + .. _manila: https://github.com/openstack/manila .. _CSI: https://github.com/ceph/ceph-csi diff --git a/doc/cephfs/troubleshooting.rst b/doc/cephfs/troubleshooting.rst index 34de1b7501d..3bbf15c9d38 100644 --- a/doc/cephfs/troubleshooting.rst +++ b/doc/cephfs/troubleshooting.rst @@ -128,6 +128,11 @@ things to do: That prevents any clients from establishing new sessions with the MDS. +* **Don't tweak max_mds** Modifying the FS setting variable ``max_mds`` is + sometimes perceived as a good step during troubleshooting or recovery effort. + Instead, doing so might further destabilize the cluster. If ``max_mds`` must + be changed in such circumstances, run the command to change ``max_mds`` with + the confirmation flag (``--yes-i-really-mean-it``). Expediting MDS journal trim @@ -407,6 +412,12 @@ its associated key. A less drastic but half-fix is to change the osd cap for your user to just ``caps osd = "allow rw"`` and delete ``tag cephfs data=....`` +Disabling the Volumes Plugin +============================ +In certain scenarios, the Volumes plugin may need to be disabled to prevent +compromising the rest of the Ceph cluster. 
For details see: +:ref:`disabling-volumes-plugin` + Reporting Issues ================ diff --git a/doc/dev/cephfs-mirroring.rst b/doc/dev/cephfs-mirroring.rst index a804a007599..e09fed213f2 100644 --- a/doc/dev/cephfs-mirroring.rst +++ b/doc/dev/cephfs-mirroring.rst @@ -17,12 +17,10 @@ Key Idea -------- For a given snapshot pair in a directory, `cephfs-mirror` daemon will rely on -readdir diff to identify changes in a directory tree. The diffs are applied to +`CephFS Snapdiff Feature` to identify changes in a directory tree. The diffs are applied to directory in the remote file system thereby only synchronizing files that have changed between two snapshots. -This feature is tracked here: https://tracker.ceph.com/issues/47034. - Currently, snapshot data is synchronized by bulk copying to the remote filesystem. @@ -407,3 +405,5 @@ Feature Status -------------- `cephfs-mirror` daemon is built by default (follows `WITH_CEPHFS` CMake rule). + +.. _CephFS Snapdiff Feature: https://croit.io/blog/cephfs-snapdiff-feature diff --git a/doc/dev/crimson/backfillmachine.rst b/doc/dev/crimson/backfillmachine.rst new file mode 100644 index 00000000000..3f579621ad4 --- /dev/null +++ b/doc/dev/crimson/backfillmachine.rst @@ -0,0 +1,26 @@ +=============== +BackfillMachine +=============== + + +In Crimson, backfill is implemented with `Boost State Chart <https://www.boost.org/doc/libs/1_86_0/libs/statechart/doc/>`_. + +.. //TODO: Once the implementation is settled: +.. * Explain exceptional states once we finish working on this code +.. * Explain example happy path flow (code walkthorugh?) +.. * https://tracker.ceph.com/issues/68728 + +A sample of the recent state model: + +.. note:: ``Cancelled`` and ``Crushed`` states are not included in the + following graph in order to make it easier to follow: + + * **Any** state is able to transit into ``Crushed``. + + * **Any** state (except from ``Initial`` and ``Waiting``) can transit into ``Cancelled`` + +.. 
image:: crimson_backfillmachine.svg + + +In similarly to :doc:`/dev/peering` a copy of the latest BackfillMachine +state model can be genereated using the `gen_state_diagram.py <https://github.com/ceph/ceph/blob/master/doc/scripts/gen_state_diagram.py>`_ diff --git a/doc/dev/crimson/crimson_backfillmachine.svg b/doc/dev/crimson/crimson_backfillmachine.svg new file mode 100644 index 00000000000..4530c2295be --- /dev/null +++ b/doc/dev/crimson/crimson_backfillmachine.svg @@ -0,0 +1,135 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" + "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> +<!-- Generated by graphviz version 2.44.0 (0) + --> +<!-- Title: G Pages: 1 --> +<svg width="504pt" height="233pt" + viewBox="0.00 0.00 504.00 232.62" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> +<g id="graph0" class="graph" transform="scale(0.65 0.65) rotate(0) translate(4 356)"> +<title>G</title> +<polygon fill="white" stroke="transparent" points="-4,4 -4,-356 776,-356 776,4 -4,4"/> +<g id="clust1" class="cluster"> +<title>cluster0</title> +<polygon fill="none" stroke="black" points="8,-8 8,-344 764,-344 764,-8 8,-8"/> +<text text-anchor="middle" x="386" y="-328.8" font-family="Times-Roman" font-size="14.00">BackfillMachine</text> +</g> +<!-- Initial --> +<g id="node1" class="node"> +<title>Initial</title> +<polygon fill="lightgrey" stroke="black" points="633,-313 582.39,-295 633,-277 683.61,-295 633,-313"/> +<polyline fill="none" stroke="black" points="593.7,-299.02 593.7,-290.98 "/> +<polyline fill="none" stroke="black" points="621.69,-281.02 644.31,-281.02 "/> +<polyline fill="none" stroke="black" points="672.3,-290.98 672.3,-299.02 "/> +<polyline fill="none" stroke="black" points="644.31,-308.98 621.69,-308.98 "/> +<text text-anchor="middle" x="633" y="-291.3" font-family="Times-Roman" font-size="14.00">Initial</text> +</g> +<!-- Enqueuing --> +<g id="node2" class="node"> +<title>Enqueuing</title> +<ellipse fill="none" stroke="black" cx="419" cy="-208" rx="59.59" ry="18"/> +<text text-anchor="middle" x="419" y="-204.3" font-family="Times-Roman" font-size="14.00">Enqueuing</text> +</g> +<!-- Initial->Enqueuing --> +<g id="edge2" class="edge"> +<title>Initial->Enqueuing</title> +<path fill="none" stroke="#1e90ff" d="M610.1,-284.9C575.14,-271.02 507.83,-244.28 463.16,-226.54"/> +<polygon fill="#1e90ff" stroke="#1e90ff" points="464.3,-223.23 453.71,-222.79 461.71,-229.73 464.3,-223.23"/> +<text text-anchor="middle" x="561" y="-247.8" font-family="Times-Roman" font-size="14.00" fill="#1e90ff">transit</text> +</g> +<!-- Done --> +<g id="node6" class="node"> +<title>Done</title> +<ellipse fill="none" stroke="black" cx="404" cy="-34" rx="34.39" ry="18"/> +<text text-anchor="middle" x="404" y="-30.3" font-family="Times-Roman" font-size="14.00">Done</text> +</g> +<!-- Initial->Done --> +<g id="edge1" class="edge"> +<title>Initial->Done</title> +<path fill="none" stroke="#000000" d="M645.22,-280.85C674.04,-248.22 740.83,-161.92 700,-103 671.6,-62.02 523.64,-44.48 448.25,-38.11"/> +<polygon fill="#000000" stroke="#000000" points="448.37,-34.61 438.12,-37.28 447.8,-41.58 448.37,-34.61"/> +<text text-anchor="middle" x="735" y="-160.8" font-family="Times-Roman" font-size="14.00" fill="#000000">transit</text> +</g> +<!-- PrimaryScanning --> +<g id="node3" class="node"> +<title>PrimaryScanning</title> +<ellipse fill="none" stroke="black" cx="603" cy="-121" rx="87.99" ry="18"/> +<text text-anchor="middle" x="603" y="-117.3" 
font-family="Times-Roman" font-size="14.00">PrimaryScanning</text> +</g> +<!-- Enqueuing->PrimaryScanning --> +<g id="edge5" class="edge"> +<title>Enqueuing->PrimaryScanning</title> +<path fill="none" stroke="#ffa500" d="M450.17,-192.63C462.72,-186.59 477.22,-179.28 490,-172 500.6,-165.96 502.14,-162.57 513,-157 524.77,-150.96 537.83,-145.32 550.27,-140.4"/> +<polygon fill="#ffa500" stroke="#ffa500" points="551.56,-143.66 559.63,-136.79 549.04,-137.12 551.56,-143.66"/> +<text text-anchor="middle" x="600.5" y="-160.8" font-family="Times-Roman" font-size="14.00" fill="#ffa500">RequestPrimaryScanning</text> +</g> +<!-- ReplicasScanning --> +<g id="node4" class="node"> +<title>ReplicasScanning</title> +<ellipse fill="none" stroke="black" cx="278" cy="-121" rx="89.88" ry="18"/> +<text text-anchor="middle" x="278" y="-117.3" font-family="Times-Roman" font-size="14.00">ReplicasScanning</text> +</g> +<!-- Enqueuing->ReplicasScanning --> +<g id="edge6" class="edge"> +<title>Enqueuing->ReplicasScanning</title> +<path fill="none" stroke="#40e0d0" d="M359.03,-207.52C291.51,-206.67 187.92,-200.56 163,-172 146.8,-153.43 167.41,-141.51 195.45,-133.98"/> +<polygon fill="#40e0d0" stroke="#40e0d0" points="196.32,-137.37 205.2,-131.6 194.65,-130.57 196.32,-137.37"/> +<text text-anchor="middle" x="252" y="-160.8" font-family="Times-Roman" font-size="14.00" fill="#40e0d0">RequestReplicasScanning</text> +</g> +<!-- Waiting --> +<g id="node5" class="node"> +<title>Waiting</title> +<ellipse fill="none" stroke="black" cx="93" cy="-121" rx="44.39" ry="18"/> +<text text-anchor="middle" x="93" y="-117.3" font-family="Times-Roman" font-size="14.00">Waiting</text> +</g> +<!-- Enqueuing->Waiting --> +<g id="edge7" class="edge"> +<title>Enqueuing->Waiting</title> +<path fill="none" stroke="#c71585" d="M359.66,-205.76C259.29,-203.03 64.81,-194.84 44,-172 34.37,-161.43 43.93,-150.03 57.08,-140.78"/> +<polygon fill="#c71585" stroke="#c71585" points="59.06,-143.67 65.59,-135.32 55.28,-137.77 59.06,-143.67"/> +<text text-anchor="middle" x="98.5" y="-160.8" font-family="Times-Roman" font-size="14.00" fill="#c71585">RequestWaiting</text> +</g> +<!-- Enqueuing->Done --> +<g id="edge8" class="edge"> +<title>Enqueuing->Done</title> +<path fill="none" stroke="#8dff33" d="M417.49,-189.95C416.27,-176.2 414.52,-156.36 413,-139 410.73,-113.02 408.17,-83.38 406.36,-62.41"/> +<polygon fill="#8dff33" stroke="#8dff33" points="409.83,-61.96 405.49,-52.3 402.86,-62.56 409.83,-61.96"/> +<text text-anchor="middle" x="459.5" y="-117.3" font-family="Times-Roman" font-size="14.00" fill="#8dff33">RequestDone</text> +</g> +<!-- PrimaryScanning->Enqueuing --> +<g id="edge3" class="edge"> +<title>PrimaryScanning->Enqueuing</title> +<path fill="none" stroke="#ff0000" d="M539.02,-133.38C498.29,-141.09 451.78,-150.94 444,-157 436.44,-162.89 430.99,-171.69 427.15,-180.26"/> +<polygon fill="#ff0000" stroke="#ff0000" points="423.84,-179.1 423.47,-189.69 430.36,-181.65 423.84,-179.1"/> +<text text-anchor="middle" x="467" y="-160.8" font-family="Times-Roman" font-size="14.00" fill="#ff0000">transit</text> +</g> +<!-- PrimaryScanning->Done --> +<g id="edge9" class="edge"> +<title>PrimaryScanning->Done</title> +<path fill="none" stroke="#ab00d5" d="M566.5,-104.41C530.3,-88.95 475.19,-65.41 439.38,-50.11"/> +<polygon fill="#ab00d5" stroke="#ab00d5" points="440.39,-46.74 429.82,-46.03 437.64,-53.18 440.39,-46.74"/> +<text text-anchor="middle" x="561.5" y="-73.8" font-family="Times-Roman" font-size="14.00" fill="#ab00d5">RequestDone</text> +</g> +<!-- 
ReplicasScanning->Enqueuing --> +<g id="edge4" class="edge"> +<title>ReplicasScanning->Enqueuing</title> +<path fill="none" stroke="#0000ff" d="M310.1,-137.84C321.29,-143.6 333.84,-150.34 345,-157 349.43,-159.64 369.29,-173.07 387.32,-185.35"/> +<polygon fill="#0000ff" stroke="#0000ff" points="385.71,-188.49 395.94,-191.23 389.65,-182.71 385.71,-188.49"/> +<text text-anchor="middle" x="389" y="-160.8" font-family="Times-Roman" font-size="14.00" fill="#0000ff">transit</text> +</g> +<!-- ReplicasScanning->Done --> +<g id="edge10" class="edge"> +<title>ReplicasScanning->Done</title> +<path fill="none" stroke="#000000" d="M284.08,-102.62C288.62,-91.9 295.82,-78.61 306,-70 321.79,-56.66 343.05,-48.25 361.73,-43.03"/> +<polygon fill="#000000" stroke="#000000" points="362.69,-46.4 371.5,-40.51 360.94,-39.62 362.69,-46.4"/> +<text text-anchor="middle" x="352.5" y="-73.8" font-family="Times-Roman" font-size="14.00" fill="#000000">RequestDone</text> +</g> +<!-- Waiting->Done --> +<g id="edge11" class="edge"> +<title>Waiting->Done</title> +<path fill="none" stroke="#1e90ff" d="M117.15,-105.82C137.52,-94.47 167.79,-79.02 196,-70 251.1,-52.39 317.41,-43.19 360.1,-38.72"/> +<polygon fill="#1e90ff" stroke="#1e90ff" points="360.56,-42.19 370.16,-37.71 359.86,-35.23 360.56,-42.19"/> +<text text-anchor="middle" x="242.5" y="-73.8" font-family="Times-Roman" font-size="14.00" fill="#1e90ff">RequestDone</text> +</g> +</g> +</svg> diff --git a/doc/dev/crimson/index.rst b/doc/dev/crimson/index.rst index 9790a9640c2..53864350bd7 100644 --- a/doc/dev/crimson/index.rst +++ b/doc/dev/crimson/index.rst @@ -7,7 +7,11 @@ Crimson developer documentation .. rubric:: Contents .. toctree:: - :glob: - - * + :maxdepth: 1 + Crimson <crimson> + OSDState <osd> + The ClientRequest Pipeline <pipeline> + Error Handling <error-handling> + BackfillMachine <backfillmachine> + PoseidonStore <poseidonstore> diff --git a/doc/dev/crimson/osd.rst b/doc/dev/crimson/osd.rst index f7f132b3f9d..4e78f648f45 100644 --- a/doc/dev/crimson/osd.rst +++ b/doc/dev/crimson/osd.rst @@ -1,5 +1,5 @@ -osd -=== +OSDState +======== .. graphviz:: diff --git a/doc/dev/developer_guide/essentials.rst b/doc/dev/developer_guide/essentials.rst index cbde8779a66..7cce4c6f898 100644 --- a/doc/dev/developer_guide/essentials.rst +++ b/doc/dev/developer_guide/essentials.rst @@ -287,16 +287,13 @@ See :ref:`kubernetes-dev` Backporting ----------- -All bugfixes should be merged to the ``main`` branch before being -backported. To flag a bugfix for backporting, make sure it has a -`tracker issue`_ associated with it and set the ``Backport`` field to a -comma-separated list of previous releases (e.g. "hammer,jewel") that you think -need the backport. -The rest (including the actual backporting) will be taken care of by the -`Stable Releases and Backports`_ team. +All bugfixes should be merged to the ``main`` branch before being backported. +To flag a bugfix for backporting, make sure it has a `tracker issue`_ +associated with it and set the ``Backport`` field to a comma-separated list of +previous releases (e.g. "hammer,jewel") that you think need the backport. You +are responsible for the backporting of pull requests that you raise. .. _`tracker issue`: http://tracker.ceph.com/ -.. 
_`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki Dependabot ---------- diff --git a/doc/dev/developer_guide/index.rst b/doc/dev/developer_guide/index.rst index e9832bea601..a27cf94b0b8 100644 --- a/doc/dev/developer_guide/index.rst +++ b/doc/dev/developer_guide/index.rst @@ -19,6 +19,7 @@ Contributing to Ceph: A Guide for Developers Tests: Unit Tests <tests-unit-tests> Tests: Integration Tests (Teuthology) <testing_integration_tests/index> Tests: Running Tests (Locally) <running-tests-locally> + Tests: Windows <tests-windows> Ceph Dashboard Developer Documentation (formerly HACKING.rst) <dash-devel> Tracing Developer Documentation <jaegertracing> Cephadm Developer Documentation <../cephadm/index> diff --git a/doc/dev/developer_guide/tests-windows.rst b/doc/dev/developer_guide/tests-windows.rst new file mode 100644 index 00000000000..f347475f7f2 --- /dev/null +++ b/doc/dev/developer_guide/tests-windows.rst @@ -0,0 +1,143 @@ +.. _dev-testing-windows: + +================= +Testing - Windows +================= + +Since Pacific, the Ceph client tools and libraries can be natively used on +Windows. This allows Windows nodes to consume Ceph without additional layers +such as iSCSI gateways or SMB shares. + +A significant amount of unit tests and integration tests were ported in order +to ensure that these components continue to function properly on Windows. + +Windows CI Job +============== + +The `Windows CI job`_ performs the following steps for each GitHub pull request: + +* spin up a Linux VM in which to build the server-side (Linux) Ceph binaries + and cross-compile the Windows (client) binaries. +* recreate the Linux VM and start a Ceph vstart cluster +* boot a Windows VM and run the Ceph tests there + +`A small PowerShell framework`_ parallelizes the tests, aggregates the results +and isolates or skips certain tests that are known to be flaky. + +The console output can contain compilation errors as well as the name of the +tests that failed. To get the console output of the failing tests as well as +Ceph and operating system logs, please check the build artifacts from the +Jenkins "Status" page. + +.. image:: ../../images/windows_ci_status_page.png + :align: center + +The Windows CI artifacts can be downloaded as a zip archive or viewed inside +the browser. Click the "artifacts" button to see the contents of the artifacts +folder. + +.. image:: ../../images/windows_ci_artifacts.png + :align: center + +Artifact contents: + +* ``client/`` - Ceph client-side logs (Windows) + * ``eventlog/`` - Windows system logs + * ``logs/`` - Ceph logs + * ``-windows.conf`` - Ceph configuration file +* ``cluster/`` - Ceph server-side logs (Linux) + * ``ceph_logs/`` + * ``journal`` +* ``test_results/`` + * ``out/`` - raw and xml test output grouped by the test executable + * ``test_results.html`` - aggregated test report (html) + * ``test_results.txt`` - aggregated test report (plaintext) + +We're using the `subunit`_ format and associated tools to aggregate the test +results, which is especially handy when running a large amount of tests in +parallel. + +The aggregated test report provides a great overview of the failing tests. +Go to the end of the file to see the actual errors:: + + {0} unittest_mempool.mempool.bufferlist_reassign [0.000000s] ... ok + {0} unittest_mempool.mempool.bufferlist_c_str [0.006000s] ... ok + {0} unittest_mempool.mempool.btree_map_test [0.000000s] ... ok + {0} ceph_test_dokan.DokanTests.test_mount [9.203000s] ... 
FAILED + + Captured details: + ~~~~~~~~~~~~~~~~~ + b'/home/ubuntu/ceph/src/test/dokan/dokan.cc:136' + b'Expected equality of these values:' + b' wait_for_mount(mountpoint)' + b' Which is: -138' + b' 0' + b'' + b'/home/ubuntu/ceph/src/test/dokan/dokan.cc:208' + b'Expected equality of these values:' + b' ret' + b' Which is: "ceph-dokan: exit status: -22"' + b' ""' + b'Failed unmapping: Y:\\' + {0} ceph_test_dokan.DokanTests.test_mount_read_only [9.140000s] ... FAILED + +The html report conveniently groups the test results by test suite (test binary). +For security reasons it isn't rendered by default but it can be downloaded and +viewed locally: + +.. image:: ../../images/windows_ci_html_report.png + :align: center + +Timeouts and missing test results are often an indication that a process crashed. +Note that the ceph status is printed out on the console before and after +performing the tests, which can help identify crashed services. + +You may also want to check the service logs (both client and server side). Also, +be aware that the Windows "application" event log will contain entries in case +of crashed Windows processes. + +Frequently asked questions +========================== + +1. Why is the Windows CI job the only one that fails on my PR? + +Ceph integration tests are normally performed through Teuthology on the Ceph +Lab infrastructure. These tests are triggered on-demand by the Ceph QA +team and do not run automatically for every submitted pull request. + +Since the Windows CI job focuses only on the client-side Ceph components, +it can run various integration tests in a timely manner for every pull request +on GitHub. **In other words, it runs various librados, librbd and libcephfs +tests that other checks such as "make check" do not.** + +For this reason, the Windows CI often catches regressions that are missed by the +other checks and would otherwise only come up through Teuthology. More often +than not, these regressions are not platform-specific and affect Linux as well. + +In case of Windows CI failures, we strongly suggest checking the test results +as described above. + +Be aware that the `Windows build script`_ may use different compilation flags +and ``-D`` options passed to CMake. For example, it defaults to ``Release`` mode +instead of ``Debug`` mode. At the same time, it uses a different toolchain +(``mingw-llvm``) and a separate set of `dependencies`_, make sure to bump the +versions if needed. + +2. Why is the Windows CI job mandatory? + +The test job was initially optional, as a result regressions were introduced +very often. + +After a time, Windows support became mature enough to make this CI job mandatory. +This significantly reduces the amount of work required to address regressions +and assures Ceph users of continued Windows support. + +As said before, another great advantage is that it runs integration tests that +quickly catch regressions which often affect Linux builds as well. This spares +developers from having to wait for the full Teuthology results. + +.. _Windows CI job: https://github.com/ceph/ceph-build/blob/main/ceph-windows-pull-requests/config/definitions/ceph-windows-pull-requests.yml +.. _A small PowerShell framework: https://github.com/ceph/ceph-win32-tests/ +.. _Windows build script: https://github.com/ceph/ceph/blob/main/win32_build.sh +.. _dependencies: https://github.com/ceph/ceph/blob/main/win32_deps_build.sh +.. _subunit: https://github.com/testing-cabal/subunit
\ No newline at end of file diff --git a/doc/dev/kclient.rst b/doc/dev/kclient.rst new file mode 100644 index 00000000000..fd4903ac1ab --- /dev/null +++ b/doc/dev/kclient.rst @@ -0,0 +1,478 @@ +Testing changes to the Linux Kernel CephFS driver +================================================= + +This walkthrough will explain one (opinionated) way to do testing of the Linux +kernel client against a development cluster. We will try to mimimize any +assumptions about pre-existing knowledge of how to do kernel builds or any +related best-practices. + +.. note:: There are many completely valid ways to do kernel development for + Ceph. This guide is a walkthrough of the author's own environment. + You may decide to do things very differently. + +Step One: build the kernel +========================== + +Clone the kernel: + +.. code-block:: bash + + git init linux && cd linux + git remote add torvalds git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git + git remote add ceph https://github.com/ceph/ceph-client.git + git fetch && git checkout torvalds/master + + +Configure the kernel: + +.. code-block:: bash + + make defconfig + +.. note:: You can alternatively use the `Ceph Kernel QA Config`_ for building the kernel. + +We now have a kernel config with reasonable defaults for the architecture you're +building on. The next thing to do is to enable configs which will build Ceph and/or +provide functionality we need to do testing. + +.. code-block:: bash + + cat > ~/.ceph.config <<EOF + CONFIG_CEPH_FS=y + CONFIG_CEPH_FSCACHE=y + CONFIG_CEPH_FS_POSIX_ACL=y + CONFIG_CEPH_FS_SECURITY_LABEL=y + CONFIG_CEPH_LIB_PRETTYDEBUG=y + CONFIG_DYNAMIC_DEBUG=y + CONFIG_DYNAMIC_DEBUG_CORE=y + CONFIG_FRAME_POINTER=y + CONFIG_FSCACHE + CONFIG_FSCACHE_STATS + CONFIG_FS_ENCRYPTION=y + CONFIG_FS_ENCRYPTION_ALGS=y + CONFIG_KGDB=y + CONFIG_KGDB_SERIAL_CONSOLE=y + CONFIG_XFS_FS=y + EOF + +Beyond enabling Ceph-related configs, we are also enabling some useful +debug configs and XFS (as an alternative to ext4 if needed for our root file +system). + +.. note:: It is a good idea to not build anything as a kernel module. Otherwise, you would need to ``make modules_install`` on the root drive of the VM. + +Now, merge the configs. + + +.. code-block:: bash + + + scripts/kconfig/merge_config.sh .config ~/.ceph.config + + +Finally, build the kernel: + +.. code-block:: bash + + make -j + + +.. note:: This document does not discuss how to get relevant utilities for your + distribution to actually build the kernel, like gcc. Please use your search + engine of choice to learn how to do that. + + +Step Two: create a VM +===================== + +A virtual machine is a good choice for testing the kernel client for a few reasons: + +* You can more easily monitor and configure networking for the VM. +* You can very rapidly test a change to the kernel (build -> mount in less than 10 seconds). +* A fault in the kernel won't crash your machine. +* You have a suite of tools available for analysis on the running kernel. + +The main decision for you to make is what Linux distribution you want to use. +This document uses Arch Linux due to the author's familiarity. We also use LVM +to create a volume. You may use partitions or whatever mechanism you like to +create a block device. In general, this block device will be used repeatedly in +testing. You may want to use snapshots to avoid a VM somehow corrupting your +root disk and forcing you to start over. + + +.. 
code-block:: bash + + # create a volume + VOLUME_GROUP=foo + sudo lvcreate -L 256G "$VOLUME_GROUP" -n $(whoami)-vm-0 + DEV="/dev/${VOLUME_GROUP}/$(whoami)-vm-0" + sudo mkfs.xfs "$DEV" + sudo mount "$DEV" /mnt + sudo pacstrap /mnt base base-devel vim less jq + sudo arch-chroot /mnt + # # delete root's password for ease of login + # passwd -d root + # mkdir -p /root/.ssh && echo "$YOUR_SSH_KEY_PUBKEY" >> /root/.ssh/authorized_keys + # exit + sudo umount /mnt + +Once that's done, we should be able to run a VM: + + +.. code-block:: bash + + qemu-system-x86_64 -enable-kvm -kernel $(pwd)/arch/x86/boot/bzImage -drive file="$DEV",if=virtio,format=raw -append 'root=/dev/vda rw' + +You should see output like: + +:: + + VNC server running on ::1:5900 + +You could view that console using: + + +.. code-block:: bash + + vncviewer 127.0.0.1:5900 + +Congratulations, you have a VM running the kernel that you just built. + + +Step Three: Networking the VM +============================= + +This is the "hard part" and requires the most customization depending on what +you want to do. For this author, I currently have a development setup like: + + +:: + + sepian netns + ______________ + | | + | kernel VM | sepia-bounce VM vossi04.front.sepia.ceph.com + | ------- | | ------ ------- + | | | | | 192.168.20.1 | | | | + | | |--|--|- <- wireguard -> | | <-- sepia vpn -> | | + | |_____| | | 192.168.20.2 |____| |_____| + | br0 | + |______________| + + +The sepia-bounce VM is used as a bounce box to the sepia lab. It can proxy ssh +connections, route any sepia-bound traffic, or serve as a DNS proxy. The use of +a sepia-bounce VM is optional but can be useful, especially if you want to +create numerous kernel VMs for testing. + +I like to use the vossi04 `developer playground`_ to build Ceph and setup a +vstart cluster. It has sufficient resources to make building Ceph very fast +(~5 minutes cold build) and local disk resources to run a decent vstart +cluster. + +To avoid overcomplicating this document with the details of the sepia-bounce +VM, I will note the following main configurations used for the purpose of +testing the kernel: + +- setup a wireguard tunnel between the machine creating kernel VMs and the sepia-bounce VM +- use ``systemd-resolved`` as a DNS resolver and listen on 192.168.20.2 (instead of just localhost) +- connect to the sepia `VPN`_ and use `systemd resolved update script`_ to configure ``systemd-resolved`` to use the DNS servers acquired via DHCP from the sepia VPN +- configure ``firewalld`` to allow wireguard traffic and to masquerade and forward traffic to the sepia vpn + +The next task is to connect the kernel VM to the sepia-bounce VM. A network +namespace can be useful for this purpose to isolate traffic / routing rules for +the VMs. 
For me, I orchestrate this using a custom systemd one-shot unit that +looks like: + +:: + + # create the net namespace + ExecStart=/usr/bin/ip netns add sepian + # bring lo up + ExecStart=/usr/bin/ip netns exec sepian ip link set dev lo up + # setup wireguard to sepia-bounce + ExecStart=/usr/bin/ip link add wg-sepian type wireguard + ExecStart=/usr/bin/wg setconf wg-sepian /etc/wireguard/wg-sepian.conf + # move the wireguard interface to the sepian nents + ExecStart=/usr/bin/ip link set wg-sepian netns sepian + # configure the static ip and bring it up + ExecStart=/usr/bin/ip netns exec sepian ip addr add 192.168.20.1/24 dev wg-sepian + ExecStart=/usr/bin/ip netns exec sepian ip link set wg-sepian up + # logging info + ExecStart=/usr/bin/ip netns exec sepian ip addr + ExecStart=/usr/bin/ip netns exec sepian ip route + # make wireguard the default route + ExecStart=/usr/bin/ip netns exec sepian ip route add default via 192.168.20.2 dev wg-sepian + # more logging + ExecStart=/usr/bin/ip netns exec sepian ip route + # add a bridge interface for VMs + ExecStart=/usr/bin/ip netns exec sepian ip link add name br0 type bridge + # configure the addresses and bring it up + ExecStart=/usr/bin/ip netns exec sepian ip addr add 192.168.0.1/24 dev br0 + ExecStart=/usr/bin/ip netns exec sepian ip link set br0 up + # masquerade/forward traffic to sepia-bounce + ExecStart=/usr/bin/ip netns exec sepian iptables -t nat -A POSTROUTING -o wg-sepian -j MASQUERADE + + +When using the network namespace, we will use ``ip netns exec``. There is a +handy feature to automatically bind mount files into the ``/etc`` namespace for +commands run via that command: + +:: + + # cat /etc/netns/sepian/resolv.conf + nameserver 192.168.20.2 + +That file will configure the libc name resolution stack to route DNS requests +for applications to the ``systemd-resolved`` daemon running on sepia-bounce. +Consequently, any application running in that netns will be able to resolve +sepia hostnames: + +:: + + $ sudo ip netns exec sepian host vossi04.front.sepia.ceph.com + vossi04.front.sepia.ceph.com has address 172.21.10.4 + + +Okay, great. We have a network namespace that forwards traffic to the sepia +VPN. The next mental step is to connect virtual machines running a kernel to +the bridge we have configured. The straightforward way to do that is to create +a "tap" device which connects to the bridge: + +.. code-block:: bash + + sudo ip netns exec sepian qemu-system-x86_64 \ + -enable-kvm \ + -kernel $(pwd)/arch/x86/boot/bzImage \ + -drive file="$DEV",if=virtio,format=raw \ + -netdev tap,id=net0,ifname=tap0,script="$HOME/bin/qemu-br0",downscript=no \ + -device virtio-net-pci,netdev=net0 \ + -append 'root=/dev/vda rw' + +The new relevant bits here are (a) executing the VM in the netns we have +constructed; (b) a ``-netdev`` command to configure a tap device; (c) a +virtual network card for the VM. There is also a script ``$HOME/bin/qemu-br0`` +run by qemu to configure the tap device it creates for the VM: + +:: + + #!/bin/bash + tap=$1 + ip link set "$tap" master br0 + ip link set dev "$tap" up + +That simply plugs the new tap device into the bridge. + +This is all well and good but we are now missing one last crucial step. What is +the IP address of the VM? There are two options: + +1. configure a static IP but the VM's root device networking stack + configuration must be modified +2. 
use DHCP and configure the root device for VMs to always use dhcp to + configure their ethernet device addresses + +The second option is more complicated to setup, since you must run a DHCP +server now, but provides the greatest flexibility for adding more VMs as needed +when testing. + +The modified (or "hacked") standard dhcpd systemd service looks like: + +:: + + # cat sepian-dhcpd.service + [Unit] + Description=IPv4 DHCP server + After=network.target network-online.target sepian-netns.service + Wants=network-online.target + Requires=sepian-netns.service + + [Service] + ExecStartPre=/usr/bin/touch /tmp/dhcpd.leases + ExecStartPre=/usr/bin/cat /etc/netns/sepian/dhcpd.conf + ExecStart=/usr/bin/dhcpd -f -4 -q -cf /etc/netns/sepian/dhcpd.conf -lf /tmp/dhcpd.leases + NetworkNamespacePath=/var/run/netns/sepian + RuntimeDirectory=dhcpd4 + User=dhcp + AmbientCapabilities=CAP_NET_BIND_SERVICE CAP_NET_RAW + ProtectSystem=full + ProtectHome=on + KillSignal=SIGINT + # We pull in network-online.target for a configured network connection. + # However this is not guaranteed to be the network connection our + # networks are configured for. So try to restart on failure with a delay + # of two seconds. Rate limiting kicks in after 12 seconds. + RestartSec=2s + Restart=on-failure + StartLimitInterval=12s + + [Install] + WantedBy=multi-user.target + +Similarly, the referenced dhcpd.conf: + +:: + + # cat /etc/netns/sepian/dhcpd.conf + option domain-name-servers 192.168.20.2; + option subnet-mask 255.255.255.0; + option routers 192.168.0.1; + subnet 192.168.0.0 netmask 255.255.255.0 { + range 192.168.0.100 192.168.0.199; + } + +Importantly, this tells the VM to route traffic to 192.168.0.1 (the IP of the +bridge in the netns) and DNS can be provided by 192.168.20.2 (via +``systemd-resolved`` on the sepia-bounce VM). 
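As an optional sanity check (a sketch assuming the ``sepian-dhcpd`` unit and lease path shown above), you can confirm from the host that ``dhcpd`` is listening inside the namespace and inspect the leases it has handed out:

.. code-block:: bash

    # confirm dhcpd is bound to UDP port 67 inside the sepian netns
    sudo ip netns exec sepian ss -ulpn | grep ':67'
    # list the leases handed out to kernel VMs
    sudo cat /tmp/dhcpd.leases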
+ +In the VM, the networking looks like: + +:: + + [root@archlinux ~]# ip link + 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000 + link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 + 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP mode DEFAULT group default qlen 1000 + link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff + 3: sit0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN mode DEFAULT group default qlen 1000 + link/sit 0.0.0.0 brd 0.0.0.0 + [root@archlinux ~]# ip addr + 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 + link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + inet6 ::1/128 scope host noprefixroute + valid_lft forever preferred_lft forever + 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000 + link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff + inet 192.168.0.100/24 metric 1024 brd 192.168.0.255 scope global dynamic enp0s3 + valid_lft 28435sec preferred_lft 28435sec + inet6 fe80::5054:ff:fe12:3456/64 scope link proto kernel_ll + valid_lft forever preferred_lft forever + 3: sit0@NONE: <NOARP> mtu 1480 qdisc noop state DOWN group default qlen 1000 + link/sit 0.0.0.0 brd 0.0.0.0 + [root@archlinux ~]# systemd-resolve --status + Global + Protocols: +LLMNR +mDNS -DNSOverTLS DNSSEC=no/unsupported + resolv.conf mode: stub + Fallback DNS Servers: 1.1.1.1#cloudflare-dns.com 9.9.9.9#dns.quad9.net 8.8.8.8#dns.google 2606:4700:4700::1111#cloudflare-dns.com 2620:fe::9#dns.quad9.net 2001:4860:4860::8888#dns.google + + Link 2 (enp0s3) + Current Scopes: DNS LLMNR/IPv4 LLMNR/IPv6 + Protocols: +DefaultRoute +LLMNR -mDNS -DNSOverTLS DNSSEC=no/unsupported + Current DNS Server: 192.168.20.2 + DNS Servers: 192.168.20.2 + + Link 3 (sit0) + Current Scopes: none + Protocols: -DefaultRoute +LLMNR +mDNS -DNSOverTLS DNSSEC=no/unsupported + + +Finally, some other networking configurations to consider: + +* Run the VM on your machine with full access to the host networking stack. If you have the sepia vpn, this will probably work without too much configuration. +* Run the VM in a netns as above but also setup the sepia vpn in the same netns. This can help to avoid using a sepia-bounce VM. You'll still need to configure routing between the bridge and the sepia VPN. +* Run the VM in a netns as above but only use a local vstart cluster (possibly in another VM) in the same netns. + + +Step Four: mounting a CephFS file system in your VM +--------------------------------------------------- + +This guide uses a vstart cluster on a machine in the sepia lab. Because the mon +addresses will change with any new vstart cluster, it will invalidate any +static configuration we may setup for our VM mounting the CephFS via the kernel +driver. So, we should create a script to fetch the configuration for our +vstart cluster prior to mounting: + +.. 
code-block:: bash + + #!/bin/bash + # kmount.sh -- mount a vstart Ceph cluster on a remote machine + + # the cephx client credential, vstart creates "client.fs" by default + NAME=fs + # static fs name, vstart creates an "a" file system by default + FS=a + # where to mount on the VM + MOUNTPOINT=/mnt + # cephfs mount point (root by default) + CEPHFS_MOUNTPOINT=/ + + function run { + printf '%s\n' "$*" >&2 + "$@" + } + + function mssh { + run ssh vossi04.front.sepia.ceph.com "cd ceph/build && (source vstart_environment.sh; $1)" + } + + # create the minimum config (including mon addresses) and store it in the VM's ceph.conf. This is not used for mounting; we're storing it for potential use with `ceph` commands. + mssh "ceph config generate-minimal-conf" > /etc/ceph/ceph.conf + # get the vstart cluster's fsid + FSID=$(mssh "ceph fsid") + # get the auth key associated with client.fs + KEY=$(mssh "ceph auth get-key client.$NAME") + # dump the v2 mon addresses and format for the -o mon_addr mount option + MONS=$(mssh "ceph mon dump --format=json" | jq -r '.mons[] | .public_addrs.addrvec[] | select(.type == "v2") | .addr' | paste -s -d/) + + # turn on kernel debugging (and any other debugging you'd like) + echo "module ceph +p" | tee /sys/kernel/debug/dynamic_debug/control + # do the mount! we use the new device syntax for this mount + run mount -t ceph "${NAME}@${FSID}.${FS}=${CEPHFS_MOUNTPOINT}" -o "mon_addr=${MONS},ms_mode=crc,name=${NAME},secret=${KEY},norequire_active_mds,noshare" "$MOUNTPOINT" + +That would be run like: + +.. code-block:: bash + + $ sudo ip netns exec sepian ssh root@192.168.0.100 ./kmount.sh + ... + mount -t ceph fs@c9653bca-110b-4f70-9f84-5a195b205e9a.a=/ -o mon_addr=172.21.10.4:40762/172.21.10.4:40764/172.21.10.4:40766,ms_mode=crc,name=fs,secret=AQD0jgln43pBCxAA7cJlZ4Px7J0UmiK4A4j3rA==,norequire_active_mds,noshare /mnt + $ sudo ip netns exec sepian ssh root@192.168.0.100 df -h /mnt + Filesystem Size Used Avail Use% Mounted on + fs@c9653bca-110b-4f70-9f84-5a195b205e9a.a=/ 169G 0 169G 0% /mnt + + +If you run into difficulties, it may be: + +* The firewall on the node running the vstart cluster is blocking your connections. +* Some misconfiguration in your networking stack. +* An incorrect configuration for the mount. + + +Step Five: testing kernel changes in teuthology +----------------------------------------------- + +There 3 static branches in the `ceph kernel git repository`_ managed by the Ceph team: + +* `for-linus <https://github.com/ceph/ceph-client/tree/for-linus>`_: A branch managed by the primary Ceph maintainer to share changes with Linus Torvalds (upstream). Do not push to this branch. +* `master <https://github.com/ceph/ceph-client/tree/master>`_: A staging ground for patches planned to be sent to Linus. Do not push to this branch. +* `testing <https://github.com/ceph/ceph-client/tree/testing>`_ A staging ground for miscellaneous patches that need wider QA testing (via nightlies or regular Ceph QA testing). Push patches you believe to be nearly ready for upstream acceptance. + +You may also push a ``wip-$feature`` branch to the ``ceph-client.git`` +repository which will be built by Jenkins. Then view the results of the build +in `Shaman <https://shaman.ceph.com/builds/kernel/>`_. + +Once a kernel branch is built, you can test it via the ``fs`` CephFS QA suite: + +.. code-block:: bash + + $ teuthology-suite ... 
--suite fs --kernel wip-$feature --filter k-testing + + +The ``k-testing`` filter is looking for the fragment which normally sets +``testing`` branch of the kernel for routine QA. That is, the ``fs`` suite +regularly runs tests against whatever is in the ``testing`` branch of the +kernel. We are overriding that choice of kernel branch via the ``--kernel +wip-$featuree`` switch. + +.. note:: Without filtering for ``k-testing``, the ``fs`` suite will also run jobs using ceph-fuse or stock kernel, libcephfs tests, and other tests that may not be of interest to you when evaluating changes to the kernel. + +The actual override is controlled using Lua merge scripts in the +``k-testing.yaml`` fragment. See that file for more details. + + +.. _VPN: https://wiki.sepia.ceph.com/doku.php?id=vpnaccess +.. _systemd resolved update script: systemd-resolved: https://wiki.archlinux.org/title/Systemd-resolved +.. _Ceph Kernel QA Config: https://github.com/ceph/ceph-build/tree/899d0848a0f487f7e4cee773556aaf9529b8db26/kernel/build +.. _developer playground: https://wiki.sepia.ceph.com/doku.php?id=devplayground#developer_playgrounds +.. _ceph kernel git repository: https://github.com/ceph/ceph-client diff --git a/doc/dev/radosgw/bucket_index.rst b/doc/dev/radosgw/bucket_index.rst index 6764641e0f5..ceff57b58cf 100644 --- a/doc/dev/radosgw/bucket_index.rst +++ b/doc/dev/radosgw/bucket_index.rst @@ -32,7 +32,7 @@ For a given bucket, the index may be split into several rados objects, called bu The default shard count for new buckets is 11, but can be overridden in the zonegroup's ``bucket_index_max_shards`` or ceph.conf's ``rgw_override_bucket_index_max_shards``. As the number of objects in a bucket grows, its index shard count will also increase as a result of dynamic resharding. -Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/rgw_reshard.cc``. +Information about the bucket's index object layout is stored in ``RGWBucketInfo`` as ``struct rgw::BucketLayout`` from ``src/rgw/rgw_bucket_layout.h``. The resharding logic is in ``src/rgw/driver/rados/rgw_reshard.cc``. ----------------- Index Transaction @@ -46,7 +46,7 @@ To keep the bucket index consistent, all object writes or deletes must also upda Object writes and deletes may race with each other, so a given object may have more than one prepared transaction at a time. RGW considers an object entry to be 'pending' if there are any outstanding transactions, or 'completed' otherwise. -This transaction is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``. +This transaction is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Object::Write::write_meta()`` for object writes, and ``RGWRados::Object::Delete::delete_obj()`` for object deletes. The bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_prepare_op()`` and ``rgw_bucket_complete_op()``. ------- Listing @@ -56,7 +56,7 @@ When listing objects, RGW will read all entries (pending and completed) from the If an RGW crashes in the middle of an `Index Transaction`_, an index entry may get stuck in this 'pending' state. 
When bucket listing encounters these pending entries, it also sends information from the head object back to the bucket index so it can update the entry and resolve its stale transactions. This message is called 'dir suggest', because the bucket index treats it as a hint or suggestion. -Bucket listing is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``. +Bucket listing is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::Bucket::List::list_objects_ordered()`` and ``RGWRados::Bucket::List::list_objects_unordered()``. ``RGWRados::check_disk_state()`` is the part that reads the head object and encodes suggested changes. The corresponding bucket index operations are implemented in ``src/cls/rgw/cls_rgw.cc`` as ``rgw_bucket_list()`` and ``rgw_dir_suggest_changes()``. -------------------- S3 Object Versioning @@ -66,9 +66,9 @@ For versioned buckets, the bucket index contains an entry for each object versio RGW stores a head object in the rgw.buckets.data pool for each object version. This rados object's oid is a combination of the object name and its version id. -In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/rgw_rados.cc`` as ``RGWRados::follow_olh()``. +In S3, a GET/HEAD request for an object name will give you that object's "current" version. To support this, RGW stores an extra 'object logical head' (olh) object whose oid includes the object name only, that acts as an indirection to the head object of its current version. This indirection logic is implemented in ``src/rgw/driver/rados/rgw_rados.cc`` as ``RGWRados::follow_olh()``. -To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index. +To maintain the consistency between this olh object and the bucket index, the index keeps a separate 'olh' entry for each object name. This entry stores a log of all writes/deletes to its versions. In ``src/rgw/driver/rados/rgw_rados.cc``, ``RGWRados::apply_olh_log()`` replays this log to guarantee that this olh object converges on the same "current" version as the bucket index. .. _ListObjectsV2: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjects.html .. _ListObjectVersions: https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectVersions.html diff --git a/doc/glossary.rst b/doc/glossary.rst index d3a0dd8f4cb..2fcef377204 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -402,7 +402,15 @@ placement group, and each placement group belongs to exactly one Ceph pool. + PLP + **P**\ower **L**\oss **P**\rotection. 
A technology that + protects the data of solid-state drives by using capacitors to + extend the amount of time available for transferring data from + the DRAM cache to the SSD's permanent memory. Consumer-grade + SSDs are rarely equipped with PLP. + :ref:`Pool<rados_pools>` + A pool is a logical partition used to store objects. Pools diff --git a/doc/governance.rst b/doc/governance.rst index 95e1c878028..bc88560f18a 100644 --- a/doc/governance.rst +++ b/doc/governance.rst @@ -29,7 +29,7 @@ Responsibilities * Point of contact for the project * Representatives for Ceph foundation board meetings * Ensure things get done - + Membership ---------- @@ -82,7 +82,7 @@ Current Members * Casey Bodley <cbodley@redhat.com> * Dan van der Ster <dan.vanderster@clyso.com> * David Orman <ormandj@1111systems.com> - * Ernesto Puerta <epuerta@redhat.com> + * Ernesto Puerta <epuertat@redhat.com> * Gregory Farnum <gfarnum@redhat.com> * Haomai Wang <haomai@xsky.com> * Ilya Dryomov <idryomov@redhat.com> @@ -96,14 +96,25 @@ Current Members * Mike Perez <miperez@redhat.com> * Myoungwon Oh <myoungwon.oh@samsung.com> * Neha Ojha <nojha@redhat.com> - * Patrick Donnelly <pdonnell@redhat.com> + * Patrick Donnelly <pdonnell@ibm.com> * Sam Just <sjust@redhat.com> * Vikhyat Umrao <vikhyat@redhat.com> * Xie Xingguo <xie.xingguo@zte.com.cn> * Yehuda Sadeh <yehuda@redhat.com> * Yingxin Cheng <yingxin.cheng@intel.com> * Yuri Weinstein <yweinste@redhat.com> - * Zac Dover <zac.dover@gmail.com> + * Zac Dover <zac.dover@proton.me> + * Laura Flores <lflores@redhat.com> + * Venky Shankar <vshankar@redhat.com> + * Guillaume Abrioux <gabrioux@redhat.com> + * Anthony D'Atri <anthony.datri@gmail.com> + * Joseph Mundackal <jmundackal@bloomberg.net> + * Gaurav Sitlani <gsitlani@ibm.com> + * Afreen Misbah <afreen@ibm.com> + * Radoslaw Zarzynski <rzarzyns@redhat.com> + * Matan Breizman <mbreizma@redhat.com> + * Yaarit Hatuka <yhatuka@ibm.com> + * Adam C. Emerson <aemerson@redhat.com> .. _ctl: diff --git a/doc/images/windows_ci_artifacts.png b/doc/images/windows_ci_artifacts.png Binary files differnew file mode 100644 index 00000000000..813ad7efbf3 --- /dev/null +++ b/doc/images/windows_ci_artifacts.png diff --git a/doc/images/windows_ci_html_report.png b/doc/images/windows_ci_html_report.png Binary files differnew file mode 100644 index 00000000000..21b76eabcd7 --- /dev/null +++ b/doc/images/windows_ci_html_report.png diff --git a/doc/images/windows_ci_status_page.png b/doc/images/windows_ci_status_page.png Binary files differnew file mode 100644 index 00000000000..e689f5d7f2b --- /dev/null +++ b/doc/images/windows_ci_status_page.png diff --git a/doc/install/windows-install.rst b/doc/install/windows-install.rst index 6da3e17231a..7cc99472c0b 100644 --- a/doc/install/windows-install.rst +++ b/doc/install/windows-install.rst @@ -85,3 +85,4 @@ Further reading .. _Windows troubleshooting: ../windows-troubleshooting .. _General CephFS Prerequisites: ../../cephfs/mount-prerequisites .. _Client Authentication: ../../cephfs/client-auth +.. _Windows testing: ../dev/tests-windows diff --git a/doc/man/8/mount.ceph.rst b/doc/man/8/mount.ceph.rst index 7ecdeb5e852..553e190bdac 100644 --- a/doc/man/8/mount.ceph.rst +++ b/doc/man/8/mount.ceph.rst @@ -192,12 +192,13 @@ Advanced :command:`wsync` Execute all namespace operations synchronously. This ensures that the namespace operation will only complete after receiving a reply from - the MDS. This is the default. + the MDS. :command:`nowsync` Allow the client to do namespace operations asynchronously. 
When this option is enabled, a namespace operation may complete before the MDS - replies, if it has sufficient capabilities to do so. + replies, if it has sufficient capabilities to do so. This has been the + default since kernel version 5.16. :command:`crush_location=x` Specify the location of the client in terms of CRUSH hierarchy (since 5.8). diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst index 4039e78fad3..492dad652d2 100644 --- a/doc/man/8/rbd.rst +++ b/doc/man/8/rbd.rst @@ -532,7 +532,7 @@ Commands disabled on all images (within the pool or namespace) for which mirroring was enabled, whether by default or explicitly. -:command:`mirror pool enable` [*pool-name*] *mode* +:command:`mirror pool enable` *pool-name* *mode* [--remote-namespace *remote-namespace-name*] Enable RBD mirroring within a pool or namespace. The mirroring mode can either be ``pool`` or ``image``. If configured in ``pool`` mode, all images in the pool or namespace @@ -540,6 +540,8 @@ Commands If configured in ``image`` mode, mirroring needs to be explicitly enabled (by ``mirror image enable`` command) on each image. + A namespace can be mirrored to a different namespace on the remote + pool using the ``--remote-namespace`` option. :command:`mirror pool info` [*pool-name*] Show information about the pool or namespace mirroring configuration. diff --git a/doc/mgr/smb.rst b/doc/mgr/smb.rst index 05e6369ddf1..3252c485a9a 100644 --- a/doc/mgr/smb.rst +++ b/doc/mgr/smb.rst @@ -96,6 +96,11 @@ clustering enables clustering regardless of the placement count. A value of ``never`` disables clustering regardless of the placement count. If unspecified, ``default`` is assumed. +public_addrs + Optional. A string in the form of <ipaddress/prefixlength>[%<destination interface>]. + Supported only when using Samba's clustering. Assign "virtual" IP + addresses that will be managed by the clustering subsystem and may automatically + move between nodes running Samba containers. Remove Cluster ++++++++++++++ diff --git a/doc/monitoring/index.rst b/doc/monitoring/index.rst index 794fdf84195..afccd9ab16a 100644 --- a/doc/monitoring/index.rst +++ b/doc/monitoring/index.rst @@ -64,6 +64,30 @@ in: It is good to outline that the main tool allowing users to observe and monitor a Ceph cluster is the **Ceph dashboard**. It provides graphics where the most important cluster and service metrics are represented. Most of the examples in this document are extracted from the dashboard graphics or extrapolated from the metrics exposed by the Ceph dashboard. +Ceph daemon health metrics +========================== + +The Ceph exporter provides a metric called ``ceph_daemon_socket_up`` that reports the liveness status of each Ceph daemon that exposes an admin socket. + +The ``ceph_daemon_socket_up`` metric indicates the health status of a Ceph daemon based on its ability to respond via the admin socket, where a value of ``1`` means healthy, and ``0`` means unhealthy. Although a Ceph daemon might still be "alive" when it reports ``ceph_daemon_socket_up=0``, this situation highlights a significant issue in its functionality. As such, this metric serves as an excellent tool for detecting problems in any of the main Ceph daemons. + +Labels: +- **``ceph_daemon``**: Identifier of the Ceph daemon exposing an admin socket on the host. +- **``hostname``**: Name of the host where the Ceph daemon is running. + +Example: + +.. 
code-block:: bash + + ceph_daemon_socket_up{ceph_daemon="mds.a",hostname="testhost"} 1 + ceph_daemon_socket_up{ceph_daemon="osd.1",hostname="testhost"} 0 + +To identify any Ceph daemons that were not responsive at any point in the last 12 hours, you can use the following PromQL expression: + +.. code-block:: bash + + ceph_daemon_socket_up == 0 or min_over_time(ceph_daemon_socket_up[12h]) == 0 + Performance metrics =================== diff --git a/doc/rados/operations/erasure-code-jerasure.rst b/doc/rados/operations/erasure-code-jerasure.rst index 8a0207748ae..c3717750754 100644 --- a/doc/rados/operations/erasure-code-jerasure.rst +++ b/doc/rados/operations/erasure-code-jerasure.rst @@ -60,6 +60,24 @@ Where: *blaum_roth*, *liber8tion* are *RAID6* equivalents in the sense that they can only be configured with *m=2*. + .. note:: When using ``blaum_roth`` coding, the default + word size of ``w=7`` is suboptimal because ``blaum_roth`` + works best when ``w+1`` is prime. When creating a new + erasure-code profile with ``technique=blaum_roth``, + set ``w`` to a number that is one integer less than a prime + number (for example, ``6``). See `Loic Dachary's + commit f51d21b to ceph/ceph <https://github.com/ceph/ceph/commit/f51d21b53d26d4f27c950cb1ba3f989e713ab325>`_ for information about + why this default cannot be changed easily in the + source code, and see `the second bullet point on + page 29 of Plank and Greenan's "Jerasure: A Library + in C Facilitating Erasure Coding for Storage + Applications" <https://github.com/ceph/jerasure/blob/master/Manual.pdf>`_ for an unequivocal statement of the restriction that applies + to ``w`` when using Blaum-Roth coding. + (Information about the proper value of ``w`` when + using ``blaum_roth`` coding was provided to the + Ceph upstream in September of 2024 by Benjamin + Mare.) + :Type: String :Required: No. :Default: reed_sol_van diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index 81dafdf03e9..1d5bb342d74 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -1502,10 +1502,10 @@ This health check is raised if a certain percentage (determined by :confval:`mon_warn_pg_not_deep_scrubbed_ratio`) of the interval has elapsed after the time the scrub was scheduled and no scrub has been performed. -PGs will receive a deep scrub only if they are flagged as *clean* (which means -that they are to be cleaned, and not that they have been examined and found to -be clean). Misplaced or degraded PGs might not be flagged as ``clean`` (see -*PG_AVAILABILITY* and *PG_DEGRADED* above). +PGs will receive a deep scrub only if they are flagged as ``clean`` (which +means that they are to be cleaned, and not that they have been examined and +found to be clean). Misplaced or degraded PGs might not be flagged as ``clean`` +(see *PG_AVAILABILITY* and *PG_DEGRADED* above). This document offers two methods of setting the value of :confval:`osd_deep_scrub_interval`. 
The first method listed here changes the diff --git a/doc/rados/troubleshooting/troubleshooting-pg.rst b/doc/rados/troubleshooting/troubleshooting-pg.rst index f8b62113745..182b9ae4568 100644 --- a/doc/rados/troubleshooting/troubleshooting-pg.rst +++ b/doc/rados/troubleshooting/troubleshooting-pg.rst @@ -5,16 +5,16 @@ Placement Groups Never Get Clean ================================ -If, after you have created your cluster, any Placement Groups (PGs) remain in -the ``active`` status, the ``active+remapped`` status or the -``active+degraded`` status and never achieves an ``active+clean`` status, you -likely have a problem with your configuration. +Placement Groups (PGs) that remain in the ``active`` status, the +``active+remapped`` status or the ``active+degraded`` status and never achieve +an ``active+clean`` status might indicate a problem with the configuration of +the Ceph cluster. -In such a situation, it may be necessary to review the settings in the `Pool, -PG and CRUSH Config Reference`_ and make appropriate adjustments. +In such a situation, review the settings in the `Pool, PG and CRUSH Config +Reference`_ and make appropriate adjustments. As a general rule, run your cluster with more than one OSD and a pool size -greater than two object replicas. +of greater than two object replicas. .. _one-node-cluster: diff --git a/doc/radosgw/config-ref.rst b/doc/radosgw/config-ref.rst index c678784249f..edc6a90b0f9 100644 --- a/doc/radosgw/config-ref.rst +++ b/doc/radosgw/config-ref.rst @@ -149,7 +149,6 @@ file under each ``[client.radosgw.{instance-name}]`` instance. .. confval:: rgw_run_sync_thread .. confval:: rgw_data_log_window .. confval:: rgw_data_log_changes_size -.. confval:: rgw_data_log_obj_prefix .. confval:: rgw_data_log_num_shards .. confval:: rgw_md_log_max_shards .. confval:: rgw_data_sync_poll_interval diff --git a/doc/radosgw/index.rst b/doc/radosgw/index.rst index da92692fa8b..3085e1a528f 100644 --- a/doc/radosgw/index.rst +++ b/doc/radosgw/index.rst @@ -88,4 +88,4 @@ Cluster with one API and then retrieve that data with the other API. D3N Data Cache <d3n_datacache> Cloud Transition <cloud-transition> Metrics <metrics> - + UADK Acceleration for Compression <uadk-accel> diff --git a/doc/radosgw/multisite.rst b/doc/radosgw/multisite.rst index 6a21b7479e6..d6925c8ed9c 100644 --- a/doc/radosgw/multisite.rst +++ b/doc/radosgw/multisite.rst @@ -507,7 +507,7 @@ For example: Updating the Period ------------------- -After updating the master zone configuration, update the period: +After updating the secondary zone configuration, update the period: .. prompt:: bash # diff --git a/doc/radosgw/uadk-accel.rst b/doc/radosgw/uadk-accel.rst new file mode 100644 index 00000000000..fdf99f891f0 --- /dev/null +++ b/doc/radosgw/uadk-accel.rst @@ -0,0 +1,132 @@ +=============================================== +UADK Acceleration for Compression +=============================================== + +UADK is a framework for applications to access hardware accelerators in a +unified, secure, and efficient way. UADK is comprised of UACCE, libwd and many +other algorithm libraries. + +See `Compressor UADK Support`_. + + +UADK in the Software Stack +========================== + +UADK is a general-purpose user space accelerator framework that uses shared +virtual addressing (SVA) to provide a unified programming interface for hardware +acceleration of cryptographic and compression algorithms. 
+ +UADK includes Unified/User-space-access-intended Accelerator Framework (UACCE), +which enables hardware accelerators that support SVA to adapt to UADK. + +Currently, HiSilicon Kunpeng hardware accelerators have been registered with +UACCE. Through the UADK framework, users can run cryptographic and compression +algorithms using hardware accelerators instead of CPUs, freeing up CPU computing +power and improving computing performance. + +A user can access the hardware accelerators by performing user-mode operations on +the character devices, or the use of UADK can be done via frameworks that have +been enabled by others including UADK support (for example, OpenSSL* libcrypto*, +DPDK, and the Linux* Kernel Crypto Framework). + +See `OpenSSL UADK Engine`_. + +UADK Environment Setup +====================== +UADK consists of UACCE, vendors’ drivers, and an algorithm layer. UADK requires the +hardware accelerator to support SVA, and the operating system to support IOMMU and +SVA. Hardware accelerators from different vendors are registered as different character +devices with UACCE by using kernel-mode drivers of the vendors. + +:: + + +----------------------------------+ + | apps | + +----+------------------------+----+ + | | + | | + +-------+--------+ +-------+-------+ + | scheduler | | alg libraries | + +-------+--------+ +-------+-------+ + | | + | | + | | + | +--------+------+ + | | vendor drivers| + | +-+-------------+ + | | + | | + +--+------------------+--+ + | libwd | + User +----+-------------+-----+ + -------------------------------------------------- + Kernel +--+-----+ +------+ + | uacce | | smmu | + +---+----+ +------+ + | + +---+------------------+ + | vendor kernel driver | + +----------------------+ + -------------------------------------------------- + +----------------------+ + | HW Accelerators | + +----------------------+ + +Configuration +============= + +#. Kernel Requirement + +User needs to make sure that UACCE is already supported in Linux kernel. The kernel version +should be at least v5.9 with SVA (Shared Virtual Addressing) enabled. + +UACCE may be built as a module or built into the kernel. Here's an example to build UACCE +with hardware accelerators for the HiSilicon Kunpeng platform. + + .. prompt:: bash $ + + CONFIG_IOMMU_SVA_LIB=y + CONFIG_ARM_SMMU=y + CONFIG_ARM_SMMU_V3=y + CONFIG_ARM_SMMU_V3_SVA=y + CONFIG_PCI_PASID=y + CONFIG_UACCE=y + CONFIG_CRYPTO_DEV_HISI_QM=y + CONFIG_CRYPTO_DEV_HISI_ZIP=y + +Make sure all these above kernel configurations are selected. + +#. UADK enablement +If the architecture is aarch64, it will automatically download the UADK source code to build +the static library. If it runs on other architecture, user can enable it with build parameters +`-DWITH_UADK=true` + +#. Manual Build UADK +As the above paragraph shows, the UADK is enabled automatically, no need to build manually. +For developer who is interested in UADK, you can refer to the below steps for building. + + .. prompt:: bash $ + + git clone https://github.com/Linaro/uadk.git + cd uadk + mkdir build + ./autogen.sh + ./configure --prefix=$PWD/build + make + make install + + .. note:: Without –prefix, UADK will be installed to /usr/local/lib by + default. If get error:"cannot find -lnuma", please install + the `libnuma-dev`. + +#. Configure + + Edit the Ceph configuration file (usually ``ceph.conf``) to enable UADK + support for *zlib* compression:: + + uadk_compressor_enabled=true + + The default value in `global.yaml.in` for `uadk_compressor_enabled` is false. + +.. 
_Compressor UADK Support: https://github.com/ceph/ceph/pull/58336 +.. _OpenSSL UADK Engine: https://github.com/Linaro/uadk_engine diff --git a/doc/rbd/rbd-mirroring.rst b/doc/rbd/rbd-mirroring.rst index 2c9cc1b66bf..add0e9503b0 100644 --- a/doc/rbd/rbd-mirroring.rst +++ b/doc/rbd/rbd-mirroring.rst @@ -23,18 +23,21 @@ capability is available in two modes: blocks can be quickly determined without the need to scan the full RBD image. Since this mode is not as fine-grained as journaling, the complete delta between two snapshots will need to be synced prior to use during a failover - scenario. Any partially applied set of deltas will be rolled back at moment - of failover. + scenario. Any partially applied set of deltas will be rolled back at the + moment of failover. .. note:: journal-based mirroring requires the Ceph Jewel release or later; snapshot-based mirroring requires the Ceph Octopus release or later. +.. note:: All instances of the term "namespace" in this document refer to RBD + namespaces. + Mirroring is configured on a per-pool basis within peer clusters and can be -configured on a specific subset of images within the pool. You can also mirror -all images within a given pool when using journal-based -mirroring. Mirroring is configured using the ``rbd`` command. The -``rbd-mirror`` daemon is responsible for pulling image updates from the remote -peer cluster and applying them to the image within the local cluster. +configured on a namespace or specific subset of images within the pool or +namespace. You can also mirror all images within a given pool or namespace when +using journal-based mirroring. Mirroring is configured using the ``rbd`` +command. The ``rbd-mirror`` daemon is responsible for pulling image updates from +the remote peer cluster and applying them to the image within the local cluster. Depending on the desired needs for replication, RBD mirroring can be configured for either one- or two-way replication: @@ -231,6 +234,57 @@ pool as follows: same name exists on the destination cluster, that pool will be used. #. If neither of the above is true, no data pool will be set. +Namespace Configuration +======================= + +Mirroring can be configured on a namespace in a pool. The pool must already +have been configured for mirroring. The namespace can be mirrored to a namespace +with the same or a different name in the remote pool. + +Enable Mirroring +---------------- + +To enable mirroring on a namespace with ``rbd``, issue the ``mirror pool enable`` +subcommand with the namespace spec and the mirroring mode, and an optional +remote namespace name:: + + rbd mirror pool enable {pool-name}/{local-namespace-name} {mode} [--remote-namespace {remote-namespace-name}] + +The mirroring mode can either be ``image`` or ``pool``: + +* **image**: When configured in ``image`` mode, mirroring must + `explicitly enabled`_ on each image. +* **pool** (default): When configured in ``pool`` mode, all images in the namespace + with the journaling feature enabled are mirrored. + +For example:: + + $ rbd --cluster site-a mirror pool enable image-pool/namespace-a image --remote-namespace namespace-b + $ rbd --cluster site-b mirror pool enable image-pool/namespace-b image --remote-namespace namespace-a + +This will set up image mode mirroring between image-pool/namespace-a on cluster +site-a and image-pool/namespace-b on cluster site-b. +The namespace and remote-namespace pair configured on a cluster must +match the remote-namespace and namespace respectively on the remote cluster. 
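
To confirm that the namespace was configured as intended, the mirroring mode and
peer state can be inspected with the usual ``rbd mirror pool`` query commands.
The example below is only a sketch and assumes that, with namespace-level
mirroring, these commands accept the same ``{pool-name}/{namespace-name}`` spec
used above::

    $ rbd --cluster site-a mirror pool info image-pool/namespace-a
    $ rbd --cluster site-a mirror pool status image-pool/namespace-a
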
+If the ``--remote-namespace`` option is not provided, the namespace will be +mirrored to a namespace with the same name in the remote pool. + +Disable Mirroring +----------------- + +To disable mirroring on a namespace with ``rbd``, specify the ``mirror pool disable`` +command and the namespace spec:: + + rbd mirror pool disable {pool-name}/{namespace-name} + +When configured in ``image`` mode, any mirror enabled images in the namespace +must be explicitly disabled before disabling mirroring on the namespace. + +For example:: + + $ rbd --cluster site-a mirror pool disable image-pool/namespace-a + $ rbd --cluster site-b mirror pool disable image-pool/namespace-b + Image Configuration =================== diff --git a/doc/releases/index.rst b/doc/releases/index.rst index 8a84e194896..fe816c31cca 100644 --- a/doc/releases/index.rst +++ b/doc/releases/index.rst @@ -21,6 +21,7 @@ security fixes. :maxdepth: 1 :hidden: + Squid (v19.2.*) <squid> Reef (v18.2.*) <reef> Quincy (v17.2.*) <quincy> @@ -59,8 +60,11 @@ receive bug fixes or backports). Release timeline ---------------- -.. ceph_timeline_gantt:: releases.yml reef quincy -.. ceph_timeline:: releases.yml reef quincy +.. ceph_timeline_gantt:: releases.yml squid reef quincy +.. ceph_timeline:: releases.yml squid reef quincy + +.. _Squid: squid +.. _19.2.0: squid#v19-2-0-squid .. _Reef: reef .. _18.2.0: reef#v18-2-0-reef diff --git a/doc/releases/reef.rst b/doc/releases/reef.rst index ca6f2ca4fc9..ed11cdcc9bf 100644 --- a/doc/releases/reef.rst +++ b/doc/releases/reef.rst @@ -21,6 +21,11 @@ may encounter crashes during `pthread_create`. For workarounds, refer to the rel upgrading your OS to avoid this unsupported combination. Related tracker: https://tracker.ceph.com/issues/66989 +Release Date +------------ + +July 24, 2024 + Notable Changes --------------- @@ -445,6 +450,11 @@ v18.2.2 Reef This is a hotfix release that resolves several flaws including Prometheus crashes and an encoder fix. +Release Date +------------ + +March 11, 2024 + Notable Changes --------------- @@ -463,6 +473,11 @@ v18.2.1 Reef This is the first backport release in the Reef series, and the first with Debian packages, for Debian Bookworm. We recommend that all users update to this release. +Release Date +------------ + +December 18, 2023 + Notable Changes --------------- @@ -963,6 +978,11 @@ This is the first stable release of Ceph Reef. *last updated 2023 Aug 04* +Release Date +------------ + +August 7, 2023 + Major Changes from Quincy -------------------------- diff --git a/doc/releases/releases.yml b/doc/releases/releases.yml index d6a18389567..77123eb7135 100644 --- a/doc/releases/releases.yml +++ b/doc/releases/releases.yml @@ -12,6 +12,12 @@ # If a version might represent an actual number (e.g. 0.80) quote it. # releases: + squid: + target_eol: 2026-09-19 + releases: + - version: 19.2.0 + released: 2024-09-26 + reef: target_eol: 2025-08-01 releases: diff --git a/doc/releases/squid.rst b/doc/releases/squid.rst new file mode 100644 index 00000000000..8f0d3b16393 --- /dev/null +++ b/doc/releases/squid.rst @@ -0,0 +1,611 @@ +===== +Squid +===== + +Squid is the 19th stable release of Ceph. + +v19.2.0 Squid +============= + +.. ATTENTION:: + iSCSI users are advised that the upstream developers of Ceph encountered a + bug during an upgrade from Ceph 19.1.1 to Ceph 19.2.0. Read `Tracker Issue + 68215 <https://tracker.ceph.com/issues/68215>`_ before attempting an upgrade + to 19.2.0. 
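
If you are unsure whether this advisory applies to your deployment, a quick
check before planning the upgrade might look like the following. This assumes a
cephadm-managed cluster; ``ceph orch ls`` simply lists deployed services of the
given type, and ``ceph versions`` reports the currently running Ceph versions.

.. prompt:: bash #

   ceph orch ls iscsi
   ceph versions
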
+ +Highlights +~~~~~~~~~~ + +RADOS + +* BlueStore has been optimized for better performance in snapshot-intensive workloads. +* BlueStore RocksDB LZ4 compression is now enabled by default to improve average performance + and "fast device" space usage. +* Other improvements include more flexible EC configurations, an OpTracker to help debug mgr + module issues, and better scrub scheduling. + +Dashboard + +* Improved navigation layout +* Support for managing CephFS snapshots and clones, as well as snapshot schedule management +* Manage authorization capabilities for CephFS resources +* Helpers on mounting a CephFS volume + +RBD + +* diff-iterate can now execute locally, bringing a dramatic performance improvement for QEMU + live disk synchronization and backup use cases. +* Support for cloning from non-user type snapshots is added. +* rbd-wnbd driver has gained the ability to multiplex image mappings. + +RGW + +* The User Accounts feature unlocks several new AWS-compatible IAM APIs for the self-service + management of users, keys, groups, roles, policy and more. + +Crimson/Seastore + +* Crimson's first tech preview release! Supporting RBD workloads on Replicated pools. For more + information please visit: https://ceph.io/en/news/crimson + +Ceph +~~~~ + +* ceph: a new `--daemon-output-file` switch is available for `ceph tell` + commands to dump output to a file local to the daemon. For commands which + produce large amounts of output, this avoids a potential spike in memory + usage on the daemon, allows for faster streaming writes to a file local to + the daemon, and reduces time holding any locks required to execute the + command. For analysis, it is necessary to manually retrieve the file from the host + running the daemon. Currently, only ``--format=json|json-pretty`` + are supported. +* ``cls_cxx_gather`` is marked as deprecated. +* Tracing: The blkin tracing feature (see + https://docs.ceph.com/en/reef/dev/blkin/) is now deprecated in favor of + Opentracing + (https://docs.ceph.com/en/reef/dev/developer_guide/jaegertracing/) and will + be removed in a later release. +* PG dump: The default output of ``ceph pg dump --format json`` has changed. + The default JSON format produces a rather massive output in large clusters + and isn't scalable, so we have removed the 'network_ping_times' section from + the output. Details in the tracker: https://tracker.ceph.com/issues/57460 + +CephFS +~~~~~~ + +* CephFS: it is now possible to pause write I/O and metadata mutations on a + tree in the file system using a new suite of subvolume quiesce commands. + This is implemented to support crash-consistent snapshots for distributed + applications. Please see the relevant section in the documentation on CephFS + subvolumes for more information. +* CephFS: MDS evicts clients which are not advancing their request tids which + causes a large buildup of session metadata resulting in the MDS going + read-only due to the RADOS operation exceeding the size threshold. + `mds_session_metadata_threshold` config controls the maximum size that a + (encoded) session metadata can grow. +* CephFS: A new "mds last-seen" command is available for querying the last time + an MDS was in the FSMap, subject to a pruning threshold. +* CephFS: For clusters with multiple CephFS file systems, all the snap-schedule + commands now expect the '--fs' argument. +* CephFS: The period specifier ``m`` now implies minutes and the period + specifier ``M`` now implies months. This has been made consistent with the + rest of the system. 
+* CephFS: Running the command "ceph fs authorize" for an existing entity now + upgrades the entity's capabilities instead of printing an error. It can now + also change read/write permissions in a capability that the entity already + holds. If the capability passed by user is same as one of the capabilities + that the entity already holds, idempotency is maintained. +* CephFS: Two FS names can now be swapped, optionally along with their IDs, + using "ceph fs swap" command. The function of this API is to facilitate + file system swaps for disaster recovery. In particular, it avoids situations + where a named file system is temporarily missing which would prompt a higher + level storage operator (like Rook) to recreate the missing file system. + See https://docs.ceph.com/en/latest/cephfs/administration/#file-systems + docs for more information. +* CephFS: Before running the command "ceph fs rename", the filesystem to be + renamed must be offline and the config "refuse_client_session" must be set + for it. The config "refuse_client_session" can be removed/unset and + filesystem can be online after the rename operation is complete. +* CephFS: Disallow delegating preallocated inode ranges to clients. Config + `mds_client_delegate_inos_pct` defaults to 0 which disables async dirops + in the kclient. +* CephFS: MDS log trimming is now driven by a separate thread which tries to + trim the log every second (`mds_log_trim_upkeep_interval` config). Also, a + couple of configs govern how much time the MDS spends in trimming its logs. + These configs are `mds_log_trim_threshold` and `mds_log_trim_decay_rate`. +* CephFS: Full support for subvolumes and subvolume groups is now available +* CephFS: The `subvolume snapshot clone` command now depends on the config + option `snapshot_clone_no_wait` which is used to reject the clone operation + when all the cloner threads are busy. This config option is enabled by + default which means that if no cloner threads are free, the clone request + errors out with EAGAIN. The value of the config option can be fetched by + using: `ceph config get mgr mgr/volumes/snapshot_clone_no_wait` and it can be + disabled by using: `ceph config set mgr mgr/volumes/snapshot_clone_no_wait + false` + for snap_schedule Manager module. +* CephFS: Commands ``ceph mds fail`` and ``ceph fs fail`` now require a + confirmation flag when some MDSs exhibit health warning MDS_TRIM or + MDS_CACHE_OVERSIZED. This is to prevent accidental MDS failover causing + further delays in recovery. +* CephFS: fixes to the implementation of the ``root_squash`` mechanism enabled + via cephx ``mds`` caps on a client credential require a new client feature + bit, ``client_mds_auth_caps``. Clients using credentials with ``root_squash`` + without this feature will trigger the MDS to raise a HEALTH_ERR on the + cluster, MDS_CLIENTS_BROKEN_ROOTSQUASH. See the documentation on this warning + and the new feature bit for more information. +* CephFS: Expanded removexattr support for cephfs virtual extended attributes. + Previously one had to use setxattr to restore the default in order to + "remove". You may now properly use removexattr to remove. You can also now + remove layout on root inode, which then will restore layout to default + layout. +* CephFS: cephfs-journal-tool is guarded against running on an online file + system. 
The 'cephfs-journal-tool --rank <fs_name>:<mds_rank> journal reset' + and 'cephfs-journal-tool --rank <fs_name>:<mds_rank> journal reset --force' + commands require '--yes-i-really-really-mean-it'. +* CephFS: "ceph fs clone status" command will now print statistics about clone + progress in terms of how much data has been cloned (in both percentage as + well as bytes) and how many files have been cloned. +* CephFS: "ceph status" command will now print a progress bar when cloning is + ongoing. If clone jobs are more than the cloner threads, it will print one + more progress bar that shows total amount of progress made by both ongoing + as well as pending clones. Both progress are accompanied by messages that + show number of clone jobs in the respective categories and the amount of + progress made by each of them. +* cephfs-shell: The cephfs-shell utility is now packaged for RHEL 9 / CentOS 9 + as required python dependencies are now available in EPEL9. +* The CephFS automatic metadata load (sometimes called "default") balancer is + now disabled by default. The new file system flag `balance_automate` + can be used to toggle it on or off. It can be enabled or disabled via + `ceph fs set <fs_name> balance_automate <bool>`. + +CephX +~~~~~ + +* cephx: key rotation is now possible using `ceph auth rotate`. Previously, + this was only possible by deleting and then recreating the key. + +Dashboard +~~~~~~~~~ + +* Dashboard: Rearranged Navigation Layout: The navigation layout has been reorganized for improved usability and easier access to key features. +* Dashboard: CephFS Improvments + * Support for managing CephFS snapshots and clones, as well as snapshot schedule management + * Manage authorization capabilities for CephFS resources + * Helpers on mounting a CephFS volume +* Dashboard: RGW Improvements + * Support for managing bucket policies + * Add/Remove bucket tags + * ACL Management + * Several UI/UX Improvements to the bucket form + +MGR +~~~ + +* MGR/REST: The REST manager module will trim requests based on the + 'max_requests' option. Without this feature, and in the absence of manual + deletion of old requests, the accumulation of requests in the array can lead + to Out Of Memory (OOM) issues, resulting in the Manager crashing. +* MGR: An OpTracker to help debug mgr module issues is now available. + +Monitoring +~~~~~~~~~~ + +* Monitoring: Grafana dashboards are now loaded into the container at runtime + rather than building a grafana image with the grafana dashboards. Official + Ceph grafana images can be found in quay.io/ceph/grafana +* Monitoring: RGW S3 Analytics: A new Grafana dashboard is now available, + enabling you to visualize per bucket and user analytics data, including total + GETs, PUTs, Deletes, Copies, and list metrics. +* The ``mon_cluster_log_file_level`` and ``mon_cluster_log_to_syslog_level`` + options have been removed. Henceforth, users should use the new generic + option ``mon_cluster_log_level`` to control the cluster log level verbosity + for the cluster log file as well as for all external entities. + +RADOS +~~~~~ + +* RADOS: ``A POOL_APP_NOT_ENABLED`` health warning will now be reported if the + application is not enabled for the pool irrespective of whether the pool is + in use or not. Always tag a pool with an application using ``ceph osd pool + application enable`` command to avoid reporting of POOL_APP_NOT_ENABLED + health warning for that pool. The user might temporarily mute this warning + using ``ceph health mute POOL_APP_NOT_ENABLED``. 
+* RADOS: `get_pool_is_selfmanaged_snaps_mode` C++ API has been deprecated due + to being prone to false negative results. Its safer replacement is + `pool_is_in_selfmanaged_snaps_mode`. +* RADOS: For bug 62338 (https://tracker.ceph.com/issues/62338), we did not + choose to condition the fix on a server flag in order to simplify + backporting. As a result, in rare cases it may be possible for a PG to flip + between two acting sets while an upgrade to a version with the fix is in + progress. If you observe this behavior, you should be able to work around it + by completing the upgrade or by disabling async recovery by setting + osd_async_recovery_min_cost to a very large value on all OSDs until the + upgrade is complete: ``ceph config set osd osd_async_recovery_min_cost + 1099511627776`` +* RADOS: A detailed version of the `balancer status` CLI command in the + balancer module is now available. Users may run `ceph balancer status detail` + to see more details about which PGs were updated in the balancer's last + optimization. See https://docs.ceph.com/en/latest/rados/operations/balancer/ + for more information. +* RADOS: Read balancing may now be managed automatically via the balancer + manager module. Users may choose between two new modes: ``upmap-read``, which + offers upmap and read optimization simultaneously, or ``read``, which may be + used to only optimize reads. For more detailed information see + https://docs.ceph.com/en/latest/rados/operations/read-balancer/#online-optimization. +* RADOS: BlueStore has been optimized for better performance in snapshot-intensive workloads. +* RADOS: BlueStore RocksDB LZ4 compression is now enabled by default to improve average + performance and "fast device" space usage. +* RADOS: A new CRUSH rule type, MSR (Multi-Step Retry), allows for more flexible EC + configurations. +* RADOS: Scrub scheduling behavior has been improved. + +Crimson/Seastore +~~~~~~~~~~~~~~~~ + +* Crimson's first tech preview release! + Supporting RBD workloads on Replicated pools. + For more information please visit: https://ceph.io/en/news/crimson + +RBD +~~~ + +* RBD: When diffing against the beginning of time (`fromsnapname == NULL`) in + fast-diff mode (`whole_object == true` with ``fast-diff`` image feature enabled + and valid), diff-iterate is now guaranteed to execute locally if exclusive + lock is available. This brings a dramatic performance improvement for QEMU + live disk synchronization and backup use cases. +* RBD: The ``try-netlink`` mapping option for rbd-nbd has become the default + and is now deprecated. If the NBD netlink interface is not supported by the + kernel, then the mapping is retried using the legacy ioctl interface. +* RBD: The option ``--image-id`` has been added to `rbd children` CLI command, + so it can be run for images in the trash. +* RBD: `Image::access_timestamp` and `Image::modify_timestamp` Python APIs now + return timestamps in UTC. +* RBD: Support for cloning from non-user type snapshots is added. This is + intended primarily as a building block for cloning new groups from group + snapshots created with `rbd group snap create` command, but has also been + exposed via the new `--snap-id` option for `rbd clone` command. +* RBD: The output of `rbd snap ls --all` command now includes the original + type for trashed snapshots. +* RBD: `RBD_IMAGE_OPTION_CLONE_FORMAT` option has been exposed in Python + bindings via `clone_format` optional parameter to `clone`, `deep_copy` and + `migration_prepare` methods. 
+* RBD: `RBD_IMAGE_OPTION_FLATTEN` option has been exposed in Python bindings + via `flatten` optional parameter to `deep_copy` and `migration_prepare` + methods. +* RBD: `rbd-wnbd` driver has gained the ability to multiplex image mappings. + Previously, each image mapping spawned its own `rbd-wnbd` daemon, which lead + to an excessive amount of TCP sessions and other resources being consumed, + eventually exceeding Windows limits. With this change, a single `rbd-wnbd` + daemon is spawned per host and most OS resources are shared between image + mappings. Additionally, `ceph-rbd` service starts much faster. + +RGW +~~~ + +* RGW: GetObject and HeadObject requests now return a x-rgw-replicated-at + header for replicated objects. This timestamp can be compared against the + Last-Modified header to determine how long the object took to replicate. +* RGW: S3 multipart uploads using Server-Side Encryption now replicate + correctly in multi-site. Previously, the replicas of such objects were + corrupted on decryption. A new tool, ``radosgw-admin bucket resync encrypted + multipart``, can be used to identify these original multipart uploads. The + ``LastModified`` timestamp of any identified object is incremented by 1ns to + cause peer zones to replicate it again. For multi-site deployments that make + any use of Server-Side Encryption, we recommended running this command + against every bucket in every zone after all zones have upgraded. +* RGW: Introducing a new data layout for the Topic metadata associated with S3 + Bucket Notifications, where each Topic is stored as a separate RADOS object + and the bucket notification configuration is stored in a bucket attribute. + This new representation supports multisite replication via metadata sync and + can scale to many topics. This is on by default for new deployments, but is + not enabled by default on upgrade. Once all radosgws have upgraded (on all + zones in a multisite configuration), the ``notification_v2`` zone feature can + be enabled to migrate to the new format. See + https://docs.ceph.com/en/squid/radosgw/zone-features for details. The "v1" + format is now considered deprecated and may be removed after 2 major releases. +* RGW: New tools have been added to radosgw-admin for identifying and + correcting issues with versioned bucket indexes. Historical bugs with the + versioned bucket index transaction workflow made it possible for the index + to accumulate extraneous "book-keeping" olh entries and plain placeholder + entries. In some specific scenarios where clients made concurrent requests + referencing the same object key, it was likely that a lot of extra index + entries would accumulate. When a significant number of these entries are + present in a single bucket index shard, they can cause high bucket listing + latencies and lifecycle processing failures. To check whether a versioned + bucket has unnecessary olh entries, users can now run ``radosgw-admin + bucket check olh``. If the ``--fix`` flag is used, the extra entries will + be safely removed. A distinct issue from the one described thus far, it is + also possible that some versioned buckets are maintaining extra unlinked + objects that are not listable from the S3/ Swift APIs. These extra objects + are typically a result of PUT requests that exited abnormally, in the middle + of a bucket index transaction - so the client would not have received a + successful response. 
Bugs in prior releases made these unlinked objects easy + to reproduce with any PUT request that was made on a bucket that was actively + resharding. Besides the extra space that these hidden, unlinked objects + consume, there can be another side effect in certain scenarios, caused by + the nature of the failure mode that produced them, where a client of a bucket + that was a victim of this bug may find the object associated with the key to + be in an inconsistent state. To check whether a versioned bucket has unlinked + entries, users can now run ``radosgw-admin bucket check unlinked``. If the + ``--fix`` flag is used, the unlinked objects will be safely removed. Finally, + a third issue made it possible for versioned bucket index stats to be + accounted inaccurately. The tooling for recalculating versioned bucket stats + also had a bug, and was not previously capable of fixing these inaccuracies. + This release resolves those issues and users can now expect that the existing + ``radosgw-admin bucket check`` command will produce correct results. We + recommend that users with versioned buckets, especially those that existed + on prior releases, use these new tools to check whether their buckets are + affected and to clean them up accordingly. +* RGW: The User Accounts feature unlocks several new AWS-compatible IAM APIs + for the self-service management of users, keys, groups, roles, policy and + more. Existing users can be adopted into new accounts. This process is + optional but irreversible. See https://docs.ceph.com/en/squid/radosgw/account + and https://docs.ceph.com/en/squid/radosgw/iam for details. +* RGW: On startup, radosgw and radosgw-admin now validate the ``rgw_realm`` + config option. Previously, they would ignore invalid or missing realms and go + on to load a zone/zonegroup in a different realm. If startup fails with a + "failed to load realm" error, fix or remove the ``rgw_realm`` option. +* RGW: The radosgw-admin commands ``realm create`` and ``realm pull`` no longer + set the default realm without ``--default``. +* RGW: Fixed an S3 Object Lock bug with PutObjectRetention requests that + specify a RetainUntilDate after the year 2106. This date was truncated to 32 + bits when stored, so a much earlier date was used for object lock + enforcement. This does not effect PutBucketObjectLockConfiguration where a + duration is given in Days. The RetainUntilDate encoding is fixed for new + PutObjectRetention requests, but cannot repair the dates of existing object + locks. Such objects can be identified with a HeadObject request based on the + x-amz-object-lock-retain-until-date response header. +* S3 ``Get/HeadObject`` now supports the query parameter ``partNumber`` to read + a specific part of a completed multipart upload. +* RGW: The SNS CreateTopic API now enforces the same topic naming requirements + as AWS: Topic names must be made up of only uppercase and lowercase ASCII + letters, numbers, underscores, and hyphens, and must be between 1 and 256 + characters long. +* RGW: Notification topics are now owned by the user that created them. By + default, only the owner can read/write their topics. Topic policy documents + are now supported to grant these permissions to other users. Preexisting + topics are treated as if they have no owner, and any user can read/write them + using the SNS API. If such a topic is recreated with CreateTopic, the + issuing user becomes the new owner. 
For backward compatibility, all users + still have permission to publish bucket notifications to topics owned by + other users. A new configuration parameter, + ``rgw_topic_require_publish_policy``, can be enabled to deny ``sns:Publish`` + permissions unless explicitly granted by topic policy. +* RGW: Fix issue with persistent notifications where the changes to topic param + that were modified while persistent notifications were in the queue will be + reflected in notifications. So if the user sets up topic with incorrect config + (password/ssl) causing failure while delivering the notifications to broker, + can now modify the incorrect topic attribute and on retry attempt to delivery + the notifications, new configs will be used. +* RGW: in bucket notifications, the ``principalId`` inside ``ownerIdentity`` + now contains the complete user ID, prefixed with the tenant ID. + +Telemetry +~~~~~~~~~ + +* The ``basic`` channel in telemetry now captures pool flags that allows us to + better understand feature adoption, such as Crimson. + To opt in to telemetry, run ``ceph telemetry on``. + +Upgrading from Quincy or Reef +-------------------------------- + +Before starting, make sure your cluster is stable and healthy (no down or recovering OSDs). +(This is optional, but recommended.) You can disable the autoscaler for all pools during the +upgrade using the noautoscale flag. + +.. note:: + + You can monitor the progress of your upgrade at each stage with the ``ceph versions`` command, which will tell you what ceph version(s) are running for each type of daemon. + +Upgrading cephadm clusters +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If your cluster is deployed with cephadm (first introduced in Octopus), then the upgrade process is entirely automated. To initiate the upgrade, + + .. prompt:: bash # + + ceph orch upgrade start --image quay.io/ceph/ceph:v19.2.0 + +The same process is used to upgrade to future minor releases. + +Upgrade progress can be monitored with + + .. prompt:: bash # + + ceph orch upgrade status + +Upgrade progress can also be monitored with `ceph -s` (which provides a simple progress bar) or more verbosely with + + .. prompt:: bash # + + ceph -W cephadm + +The upgrade can be paused or resumed with + + .. prompt:: bash # + + ceph orch upgrade pause # to pause + ceph orch upgrade resume # to resume + +or canceled with + +.. prompt:: bash # + + ceph orch upgrade stop + +Note that canceling the upgrade simply stops the process; there is no ability to downgrade back to Quincy or Reef. + +Upgrading non-cephadm clusters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. note:: + + 1. If your cluster is running Quincy (17.2.x) or later, you might choose to first convert it to use cephadm so that the upgrade to Squid is automated (see above). + For more information, see https://docs.ceph.com/en/squid/cephadm/adoption/. + + 2. If your cluster is running Quincy (17.2.x) or later, systemd unit file names have changed to include the cluster fsid. To find the correct systemd unit file name for your cluster, run following command: + + ``` + systemctl -l | grep <daemon type> + ``` + + Example: + + ``` + $ systemctl -l | grep mon | grep active + ceph-6ce0347c-314a-11ee-9b52-000af7995d6c@mon.f28-h21-000-r630.service loaded active running Ceph mon.f28-h21-000-r630 for 6ce0347c-314a-11ee-9b52-000af7995d6c + ``` + +#. Set the `noout` flag for the duration of the upgrade. (Optional, but recommended.) + + .. prompt:: bash # + + ceph osd set noout + +#. 
Upgrade monitors by installing the new packages and restarting the monitor daemons. For example, on each monitor host + + .. prompt:: bash # + + systemctl restart ceph-mon.target + + Once all monitors are up, verify that the monitor upgrade is complete by looking for the `squid` string in the mon map. The command + + .. prompt:: bash # + + ceph mon dump | grep min_mon_release + + should report: + + .. prompt:: bash # + + min_mon_release 19 (squid) + + If it does not, that implies that one or more monitors hasn't been upgraded and restarted and/or the quorum does not include all monitors. + +#. Upgrade `ceph-mgr` daemons by installing the new packages and restarting all manager daemons. For example, on each manager host, + + .. prompt:: bash # + + systemctl restart ceph-mgr.target + + Verify the `ceph-mgr` daemons are running by checking `ceph -s`: + + .. prompt:: bash # + + ceph -s + + :: + + ... + services: + mon: 3 daemons, quorum foo,bar,baz + mgr: foo(active), standbys: bar, baz + ... + +#. Upgrade all OSDs by installing the new packages and restarting the ceph-osd daemons on all OSD hosts + + .. prompt:: bash # + + systemctl restart ceph-osd.target + +#. Upgrade all CephFS MDS daemons. For each CephFS file system, + + #. Disable standby_replay: + + .. prompt:: bash # + + ceph fs set <fs_name> allow_standby_replay false + + #. Reduce the number of ranks to 1. (Make note of the original number of MDS daemons first if you plan to restore it later.) + + .. prompt:: bash # + + ceph status # ceph fs set <fs_name> max_mds 1 + + #. Wait for the cluster to deactivate any non-zero ranks by periodically checking the status + + .. prompt:: bash # + + ceph status + + #. Take all standby MDS daemons offline on the appropriate hosts with + + .. prompt:: bash # + + systemctl stop ceph-mds@<daemon_name> + + #. Confirm that only one MDS is online and is rank 0 for your FS + + .. prompt:: bash # + + ceph status + + #. Upgrade the last remaining MDS daemon by installing the new packages and restarting the daemon + + .. prompt:: bash # + + systemctl restart ceph-mds.target + + #. Restart all standby MDS daemons that were taken offline + + .. prompt:: bash # + + systemctl start ceph-mds.target + + #. Restore the original value of `max_mds` for the volume + + .. prompt:: bash # + + ceph fs set <fs_name> max_mds <original_max_mds> + +#. Upgrade all radosgw daemons by upgrading packages and restarting daemons on all hosts + + .. prompt:: bash # + + systemctl restart ceph-radosgw.target + +#. Complete the upgrade by disallowing pre-Squid OSDs and enabling all new Squid-only functionality + + .. prompt:: bash # + + ceph osd require-osd-release squid + +#. If you set `noout` at the beginning, be sure to clear it with + + .. prompt:: bash # + + ceph osd unset noout + +#. Consider transitioning your cluster to use the cephadm deployment and orchestration framework to simplify + cluster management and future upgrades. For more information on converting an existing cluster to cephadm, + see https://docs.ceph.com/en/squid/cephadm/adoption/. + +Post-upgrade +~~~~~~~~~~~~ + +#. Verify the cluster is healthy with `ceph health`. If your cluster is running Filestore, and you are upgrading directly from Quincy to Squid, a deprecation warning is expected. This warning can be temporarily muted using the following command + + .. prompt:: bash # + + ceph health mute OSD_FILESTORE + +#. 
Consider enabling the `telemetry module <https://docs.ceph.com/en/squid/mgr/telemetry/>`_ to send anonymized usage statistics and crash information to the Ceph upstream developers. To see what would be reported (without actually sending any information to anyone), + + .. prompt:: bash # + + ceph telemetry preview-all + + If you are comfortable with the data that is reported, you can opt-in to automatically report the high-level cluster metadata with + + .. prompt:: bash # + + ceph telemetry on + + The public dashboard that aggregates Ceph telemetry can be found at https://telemetry-public.ceph.com/. + +Upgrading from pre-Quincy releases (like Pacific) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You **must** first upgrade to Quincy (17.2.z) or Reef (18.2.z) before upgrading to Squid. diff --git a/doc/start/beginners-guide.rst b/doc/start/beginners-guide.rst index 6440e98834b..eadf6b0cfde 100644 --- a/doc/start/beginners-guide.rst +++ b/doc/start/beginners-guide.rst @@ -8,6 +8,9 @@ Ceph is a clustered and distributed storage manager. If that's too cryptic, then just think of Ceph as a computer program that stores data and uses a network to make sure that there is a backup copy of the data. +Components of Ceph +================== + Storage Interfaces ------------------ @@ -94,6 +97,89 @@ MDS A metadata server (MDS) is necessary for the proper functioning of CephFS. See :ref:`orchestrator-cli-cephfs` and :ref:`arch-cephfs`. +Vstart Cluster Installation and Configuration Procedure +======================================================= + +#. Clone the ``ceph/ceph`` repository: + + .. prompt:: bash # + + git clone git@github.com:ceph/ceph + +#. Update the submodules in the ``ceph/ceph`` repository: + + .. prompt:: bash # + + git submodule update --init --recursive --progress + +#. Run ``install-deps.sh`` from within the directory into which you cloned the + ``ceph/ceph`` repository: + + .. prompt:: bash # + + ./install-deps.sh + +#. Install the ``python3-routes`` package: + + .. prompt:: bash # + + apt install python3-routes + +#. Move into the ``ceph`` directory. You will know that you are in the correct + directory if it contains the file ``do_cmake.sh``: + + .. prompt:: bash # + + cd ceph + +#. Run the ``do_cmake.sh`` script: + + .. prompt:: bash # + + ./do_cmake.sh + +#. The ``do_cmake.sh`` script creates a ``build/`` directory. Move into the + ``build/`` directory: + + .. prompt:: bash # + + cd build + +#. Use ``ninja`` to build the development environment: + + .. prompt:: bash # + + ninja -j3 + + .. note:: This step takes a long time to run. The ``ninja -j3`` command + kicks off a process consisting of 2289 steps. This step took over three + hours when I ran it on an Intel NUC with an i7 in September of 2024. + +#. Install the Ceph development environment: + + .. prompt:: bash # + + ninja install + + This step does not take as long as the previous step. + +#. Build the vstart cluster: + + .. prompt:: bash # + + ninja vstart + +#. Start the vstart cluster: + + .. prompt:: bash # + + ../src/vstart.sh --debug --new -x --localhost --bluestore + + .. note:: Run this command from within the ``ceph/build`` directory. 
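
#. Confirm that the vstart cluster is running by querying it with the freshly
   built ``ceph`` binary in ``build/bin`` (a minimal check, assuming the
   default options used above):

   .. prompt:: bash #

      ./bin/ceph -s

   When you are finished with the cluster, it can be stopped from the same
   ``ceph/build`` directory:

   .. prompt:: bash #

      ../src/stop.sh
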
+ + + + LINKS ----- diff --git a/make-dist b/make-dist index e874436a5e7..033bedebd87 100755 --- a/make-dist +++ b/make-dist @@ -23,7 +23,7 @@ version=$1 [ -z "$version" ] && version=$(git describe --long --match 'v*' | sed 's/^v//') if expr index $version '-' > /dev/null; then rpm_version=$(echo $version | cut -d - -f 1-1) - rpm_release=$(echo $version | cut -d - -f 2- | sed 's/-/./') + rpm_release=$(echo $version | cut -d - -f 2- | sed 's/-/./g') else rpm_version=$version rpm_release=0 diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index fa2899b22c1..cde1a736f8c 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -45,8 +45,8 @@ 'for': '30s', expr: ||| ( - (ceph_health_detail{name="MON_DOWN"} == 1) * on() ( - count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1) + (ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) ( + count(ceph_mon_quorum_status == 1) by(cluster)== bool (floor(count(ceph_mon_metadata) by(cluster) / 2) + 1) ) ) == 1 |||, @@ -54,22 +54,20 @@ annotations: { documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down', summary: 'Monitor quorum is at risk%(cluster)s' % $.MultiClusterSummary(), - description: '{{ $min := query "floor(count(ceph_mon_metadata) / 2) + 1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}', + description: '{{ $min := printf "floor(count(ceph_mon_metadata{cluster=\'%s\'}) / 2) + 1" .Labels.cluster | query | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range printf "(ceph_mon_quorum_status{cluster=\'%s\'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}', }, }, { alert: 'CephMonDown', 'for': '30s', expr: ||| - count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1) + (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1))) |||, labels: { severity: 'warning', type: 'ceph_default' }, annotations: { documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down', summary: 'One or more monitors down%(cluster)s' % $.MultiClusterSummary(), - description: ||| - {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. 
The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} - |||, + description: '{{ $down := printf "count(ceph_mon_quorum_status{cluster=\'%s\'} == 0)" .Labels.cluster | query | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range printf "(ceph_mon_quorum_status{cluster=\'%s\'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}', }, }, { @@ -112,11 +110,11 @@ rules: [ { alert: 'CephOSDDownHigh', - expr: 'count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10', + expr: 'count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 >= 10', labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.1' }, annotations: { summary: 'More than 10%% of OSDs are down%(cluster)s' % $.MultiClusterSummary(), - description: '{{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}', + description: '{{ $value | humanize }}% or {{ with printf "count (ceph_osd_up{cluster=\'%s\'} == 0)" .Labels.cluster | query }}{{ . | first | value }}{{ end }} of {{ with printf "count (ceph_osd_up{cluster=\'%s\'})" .Labels.cluster | query }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range printf "(ceph_osd_up{cluster=\'%s\'} * on(cluster, ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}', }, }, { @@ -126,7 +124,7 @@ labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.8' }, annotations: { summary: 'An OSD host is offline%(cluster)s' % $.MultiClusterSummary(), - description: 'The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}', + description: 'The following OSDs are down: {{- range printf "(ceph_osd_up{cluster=\'%s\'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}', }, }, { @@ -137,9 +135,7 @@ annotations: { documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down', summary: 'An OSD has been marked down%(cluster)s' % $.MultiClusterSummary(), - description: ||| - {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. 
The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} - |||, + description: '{{ $num := printf "count(ceph_osd_up{cluster=\'%s\'} == 0) " .Labels.cluster | query | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range printf "(ceph_osd_up{cluster=\'%s\'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}', }, }, { @@ -235,7 +231,7 @@ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.7' }, annotations: { documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany', - summary: 'Too many devices are predicted to fail, unable to resolve%(cluster)s' % $.MultiClusterSummary(), + summary: 'Too many devices are predicted to fail%(cluster)s, unable to resolve' % $.MultiClusterSummary(), description: 'The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated.', }, }, @@ -298,7 +294,7 @@ labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.1' }, annotations: { documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages', - summary: 'CephFS filesystem is damaged%(cluster)s.' % $.MultiClusterSummary(), + summary: 'CephFS filesystem is damaged%(cluster)s' % $.MultiClusterSummary(), description: 'Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support.', }, }, @@ -390,7 +386,7 @@ expr: 'up{job="ceph"} == 0', labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.6.2' }, annotations: { - summary: 'The mgr/prometheus module is not available%(cluster)s' % $.MultiClusterSummary(), + summary: 'The mgr/prometheus module is not available', description: "The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. 
If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'.", }, }, @@ -507,7 +503,7 @@ expr: 'node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5', labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.1' }, annotations: { - summary: 'Root filesystem is dangerously full%(cluster)s' % $.MultiClusterSummary(), + summary: 'Root filesystem is dangerously full', description: 'Root volume is dangerously full: {{ $value | humanize }}% free.', }, }, @@ -527,7 +523,7 @@ ||| % $._config, labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.2' }, annotations: { - summary: 'One or more NICs reports packet drops%(cluster)s' % $.MultiClusterSummary(), + summary: 'One or more NICs reports packet drops', description: 'Node {{ $labels.instance }} experiences packet drop > %(CephNodeNetworkPacketDropsThreshold)s%% or > %(CephNodeNetworkPacketDropsPerSec)s packets/s on interface {{ $labels.device }}.' % { CephNodeNetworkPacketDropsThreshold: $._config.CephNodeNetworkPacketDropsThreshold * 100, CephNodeNetworkPacketDropsPerSec: $._config.CephNodeNetworkPacketDropsPerSec }, }, }, @@ -564,7 +560,7 @@ }, { alert: 'CephNodeDiskspaceWarning', - expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0', + expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) * on(cluster, instance) group_left(nodename) node_uname_info < 0', labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.4' }, annotations: { summary: 'Host filesystem free space is getting low%(cluster)s' % $.MultiClusterSummary(), @@ -573,7 +569,7 @@ }, { alert: 'CephNodeInconsistentMTU', - expr: 'node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )', + expr: 'node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )', labels: { severity: 'warning', type: 'ceph_default' }, annotations: { summary: 'MTU settings across Ceph hosts are inconsistent%(cluster)s' % $.MultiClusterSummary(), @@ -611,7 +607,7 @@ annotations: { documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full', summary: 'Pool is full - writes are blocked%(cluster)s' % $.MultiClusterSummary(), - description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. 
Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)", + description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range printf \"topk(5, sort_desc(ceph_pool_percent_used{cluster='%s'} * on(cluster,pool_id) group_right ceph_pool_metadata))\" .Labels.cluster | query }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)", }, }, { @@ -647,7 +643,7 @@ labels: { severity: 'warning', type: 'ceph_default' }, annotations: { documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops', - summary: '{{ $labels.ceph_daemon }} operations are slow to complete', + summary: '{{ $labels.ceph_daemon }} operations are slow to complete%(cluster)s' % $.MultiClusterSummary(), description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)', }, }, @@ -763,7 +759,7 @@ expr: 'absent(up{job="ceph"})', labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.12.1' }, annotations: { - summary: 'The scrape job for Ceph is missing from Prometheus%(cluster)s' % $.MultiClusterSummary(), + summary: 'The scrape job for Ceph is missing from Prometheus', description: "The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. 
Please review the job definitions in the prometheus.yml file of the prometheus instance.", }, }, @@ -775,7 +771,7 @@ { alert: 'CephObjectMissing', 'for': '30s', - expr: '(ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1', + expr: '(ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() group_right(cluster) (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by(cluster)) == 1', labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.1' }, annotations: { documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound', @@ -807,31 +803,31 @@ { alert: 'CephRBDMirrorImagesPerDaemonHigh', 'for': '1m', - expr: 'sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, + expr: 'sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.2' }, annotations: { - summary: 'Number of image replications are now above %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, - description: 'Number of image replications per daemon is not suppossed to go beyond threshold %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, + summary: 'Number of image replications are now above %(CephRBDMirrorImagesPerDaemonThreshold)s%(cluster)s' % [$._config.CephRBDMirrorImagesPerDaemonThreshold, $.MultiClusterSummary()], + description: 'Number of image replications per daemon is not supposed to go beyond threshold %(CephRBDMirrorImagesPerDaemonThreshold)s' % $._config, }, }, { alert: 'CephRBDMirrorImagesNotInSync', 'for': '1m', - expr: 'sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0', + expr: 'sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0', labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.3' }, annotations: { - summary: 'Some of the RBD mirror images are not in sync with the remote counter parts.', + summary: 'Some of the RBD mirror images are not in sync with the remote counter parts%(cluster)s' % $.MultiClusterSummary(), description: 'Both local and remote RBD mirror images should be in sync.', }, }, { alert: 'CephRBDMirrorImagesNotInSyncVeryHigh', 'for': '1m', - expr: 'count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)', + expr: 'count by (ceph_daemon, cluster) ((topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots)*.1)', labels: { severity: 'critical', type: 'ceph_default', oid: 
'1.3.6.1.4.1.50495.1.2.1.10.4' }, annotations: { - summary: 'Number of unsynchronized images are very high.', - description: 'More than 10% of the images have synchronization problems', + summary: 'Number of unsynchronized images are very high%(cluster)s' % $.MultiClusterSummary(), + description: 'More than 10% of the images have synchronization problems.', }, }, { @@ -840,7 +836,7 @@ expr: 'rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > %.2f' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold], labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.5' }, annotations: { - summary: 'The replication network usage has been increased over %d%s in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'], + summary: 'The replication network usage%(cluster)s has been increased over %d%s in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes' % [$.MultiClusterSummary(), $._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'], description: 'Detected a heavy increase in bandwidth for rbd replications (over %d%s) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously' % [$._config.CephRBDMirrorImageTransferBandwidthThreshold * 100, '%'], }, }, @@ -852,50 +848,50 @@ { alert: 'NVMeoFSubsystemNamespaceLimit', 'for': '1m', - expr: '(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit', + expr: '(count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit', labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces %(cluster)s' % $.MultiClusterSummary(), + summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces%(cluster)s' % $.MultiClusterSummary(), description: 'Subsystems have a max namespace limit defined at creation time. 
This alert means that no more namespaces can be added to {{ $labels.nqn }}', }, }, { alert: 'NVMeoFTooManyGateways', 'for': '1m', - expr: 'count(ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster], + expr: 'count(ceph_nvmeof_gateway_info) by (cluster) > %.2f' % [$._config.NVMeoFMaxGatewaysPerCluster], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'Max supported gateways exceeded %(cluster)s' % $.MultiClusterSummary(), + summary: 'Max supported gateways exceeded%(cluster)s' % $.MultiClusterSummary(), description: 'You may create many gateways, but %(NVMeoFMaxGatewaysPerCluster)d is the tested limit' % $._config, }, }, { alert: 'NVMeoFMaxGatewayGroupSize', 'for': '1m', - expr: 'count by(group) (ceph_nvmeof_gateway_info) > %.2f' % [$._config.NVMeoFMaxGatewaysPerGroup], + expr: 'count(ceph_nvmeof_gateway_info) by (cluster,group) > %.2f' % [$._config.NVMeoFMaxGatewaysPerGroup], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'Max gateways within a gateway group ({{ $labels.group }}) exceeded %(cluster)s' % $.MultiClusterSummary(), + summary: 'Max gateways within a gateway group ({{ $labels.group }}) exceeded%(cluster)s' % $.MultiClusterSummary(), description: 'You may create many gateways in a gateway group, but %(NVMeoFMaxGatewaysPerGroup)d is the tested limit' % $._config, }, }, { alert: 'NVMeoFSingleGatewayGroup', 'for': '5m', - expr: 'count by(group) (ceph_nvmeof_gateway_info) == 1', + expr: 'count(ceph_nvmeof_gateway_info) by(cluster,group) == 1', labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible %(cluster)s' % $.MultiClusterSummary(), + summary: 'The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible%(cluster)s' % $.MultiClusterSummary(), description: 'Although a single member gateway group is valid, it should only be used for test purposes', }, }, { alert: 'NVMeoFHighGatewayCPU', 'for': '10m', - expr: 'label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighGatewayCPU], + expr: 'label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > %.2f' % [$._config.NVMeoFHighGatewayCPU], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'CPU used by {{ $labels.instance }} NVMe-oF Gateway is high %(cluster)s' % $.MultiClusterSummary(), + summary: 'CPU used by {{ $labels.instance }} NVMe-oF Gateway is high%(cluster)s' % $.MultiClusterSummary(), description: 'Typically, high CPU may indicate degraded performance. 
Consider increasing the number of reactor cores', }, }, @@ -905,27 +901,27 @@ expr: 'ceph_nvmeof_subsystem_metadata{allow_any_host="yes"}', labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'Subsystem {{ $labels.nqn }} has been defined without host level security %(cluster)s' % $.MultiClusterSummary(), + summary: 'Subsystem {{ $labels.nqn }} has been defined without host level security%(cluster)s' % $.MultiClusterSummary(), description: 'It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss', }, }, { alert: 'NVMeoFTooManySubsystems', 'for': '1m', - expr: 'count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway], + expr: 'count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > %.2f' % [$._config.NVMeoFMaxSubsystemsPerGateway], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'The number of subsystems defined to the gateway exceeds supported values %(cluster)s' % $.MultiClusterSummary(), + summary: 'The number of subsystems defined to the gateway exceeds supported values%(cluster)s' % $.MultiClusterSummary(), description: 'Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported', }, }, { alert: 'NVMeoFVersionMismatch', 'for': '1h', - expr: 'count(count by(version) (ceph_nvmeof_gateway_info)) > 1', + expr: 'count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1', labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'The cluster has different NVMe-oF gateway releases active %(cluster)s' % $.MultiClusterSummary(), + summary: 'Too many different NVMe-oF gateway releases active%(cluster)s' % $.MultiClusterSummary(), description: 'This may indicate an issue with deployment. 
Check cephadm logs', }, }, @@ -935,17 +931,17 @@ expr: 'ceph_nvmeof_subsystem_host_count > %.2f' % [$._config.NVMeoFHighClientCount], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'The number of clients connected to {{ $labels.nqn }} is too high %(cluster)s' % $.MultiClusterSummary(), + summary: 'The number of clients connected to {{ $labels.nqn }} is too high%(cluster)s' % $.MultiClusterSummary(), description: 'The supported limit for clients connecting to a subsystem is %(NVMeoFHighClientCount)d' % $._config, }, }, { alert: 'NVMeoFHighHostCPU', 'for': '10m', - expr: '100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= %.2f' % [$._config.NVMeoFHighHostCPU], + expr: '100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= %.2f' % [$._config.NVMeoFHighHostCPU], labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'The CPU is high ({{ $value }}%%) on NVMeoF Gateway host ({{ $labels.host }}) %(cluster)s' % $.MultiClusterSummary(), + summary: 'The CPU is high ({{ $value }}%%) on NVMeoF Gateway host ({{ $labels.host }})%(cluster)s' % $.MultiClusterSummary(), description: 'High CPU on a gateway host can lead to CPU contention and performance degradation', }, }, @@ -955,7 +951,7 @@ expr: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down"}', labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.14.1' }, annotations: { - summary: 'Network interface {{ $labels.device }} is down %(cluster)s' % $.MultiClusterSummary(), + summary: 'Network interface {{ $labels.device }} is down%(cluster)s' % $.MultiClusterSummary(), description: 'A NIC used by one or more subsystems is in a down state', }, }, @@ -965,7 +961,7 @@ expr: 'ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"}', labels: { severity: 'warning', type: 'ceph_default' }, annotations: { - summary: 'Network interface {{ $labels.device }} is not running in full duplex mode %(cluster)s' % $.MultiClusterSummary(), + summary: 'Network interface {{ $labels.device }} is not running in full duplex mode%(cluster)s' % $.MultiClusterSummary(), description: 'Until this is resolved, performance from the gateway will be degraded', }, }, diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 84452e5845a..ba6a6ded0a3 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -3,8 +3,8 @@ groups: rules: - alert: "CephHealthError" annotations: - description: "The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information." - summary: "Ceph is in the ERROR state" + description: "The cluster state has been HEALTH_ERROR for more than 5 minutes on cluster {{ $labels.cluster }}. Please check 'ceph health detail' for more information." + summary: "Ceph is in the ERROR state on cluster {{ $labels.cluster }}" expr: "ceph_health_status == 2" for: "5m" labels: @@ -13,8 +13,8 @@ groups: type: "ceph_default" - alert: "CephHealthWarning" annotations: - description: "The cluster state has been HEALTH_WARN for more than 15 minutes. 
Please check 'ceph health detail' for more information." - summary: "Ceph is in the WARNING state" + description: "The cluster state has been HEALTH_WARN for more than 15 minutes on cluster {{ $labels.cluster }}. Please check 'ceph health detail' for more information." + summary: "Ceph is in the WARNING state on cluster {{ $labels.cluster }}" expr: "ceph_health_status == 1" for: "15m" labels: @@ -24,13 +24,13 @@ groups: rules: - alert: "CephMonDownQuorumAtRisk" annotations: - description: "{{ $min := query \"floor(count(ceph_mon_metadata) / 2) + 1\" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query \"(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}" + description: "{{ $min := printf \"floor(count(ceph_mon_metadata{cluster='%s'}) / 2) + 1\" .Labels.cluster | query | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range printf \"(ceph_mon_quorum_status{cluster='%s'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down" - summary: "Monitor quorum is at risk" + summary: "Monitor quorum is at risk on cluster {{ $labels.cluster }}" expr: | ( - (ceph_health_detail{name="MON_DOWN"} == 1) * on() ( - count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1) + (ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) ( + count(ceph_mon_quorum_status == 1) by(cluster)== bool (floor(count(ceph_mon_metadata) by(cluster) / 2) + 1) ) ) == 1 for: "30s" @@ -40,12 +40,11 @@ groups: type: "ceph_default" - alert: "CephMonDown" annotations: - description: | - {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} + description: "{{ $down := printf \"count(ceph_mon_quorum_status{cluster='%s'} == 0)\" .Labels.cluster | query | first | value }}{{ $s := \"\" }}{{ if gt $down 1.0 }}{{ $s = \"s\" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. 
The following monitors are down: {{- range printf \"(ceph_mon_quorum_status{cluster='%s'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down" - summary: "One or more monitors down" + summary: "One or more monitors down on cluster {{ $labels.cluster }}" expr: | - count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1) + (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1))) for: "30s" labels: severity: "warning" @@ -54,7 +53,7 @@ groups: annotations: description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit" - summary: "Filesystem space on at least one monitor is critically low" + summary: "Filesystem space on at least one monitor is critically low on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"MON_DISK_CRIT\"} == 1" for: "1m" labels: @@ -65,7 +64,7 @@ groups: annotations: description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low" - summary: "Drive space on at least one monitor is approaching full" + summary: "Drive space on at least one monitor is approaching full on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"MON_DISK_LOW\"} == 1" for: "5m" labels: @@ -75,7 +74,7 @@ groups: annotations: description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon." 
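The monitor hunks above all follow the same templating pattern: instead of handing the template's query function a fixed PromQL string, the description builds the string with printf so the firing alert's own cluster label is substituted in, and only that cluster's monitors are enumerated. A minimal sketch of the pattern — "ExampleMonDown" is a placeholder, not an alert added by this patch:

    # Illustrative sketch only. printf renders the PromQL string with this
    # alert's cluster label before it is piped to the template query function,
    # so the description lists monitors from that cluster only.
    - alert: "ExampleMonDown"
      annotations:
        description: "Down monitors: {{- range printf \"(ceph_mon_quorum_status{cluster='%s'} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
        summary: "One or more monitors down on cluster {{ $labels.cluster }}"
      # Aggregating by cluster yields one alert per cluster and preserves the
      # cluster label that both annotations rely on.
      expr: "count by (cluster) (ceph_mon_quorum_status == 0) > 0"
      for: "30s"
      labels:
        severity: "warning"
        type: "ceph_default"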
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew" - summary: "Clock skew detected among monitors" + summary: "Clock skew detected among monitors on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"MON_CLOCK_SKEW\"} == 1" for: "1m" labels: @@ -85,17 +84,17 @@ groups: rules: - alert: "CephOSDDownHigh" annotations: - description: "{{ $value | humanize }}% or {{ with query \"count(ceph_osd_up == 0)\" }}{{ . | first | value }}{{ end }} of {{ with query \"count(ceph_osd_up)\" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}" - summary: "More than 10% of OSDs are down" - expr: "count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10" + description: "{{ $value | humanize }}% or {{ with printf \"count (ceph_osd_up{cluster='%s'} == 0)\" .Labels.cluster | query }}{{ . | first | value }}{{ end }} of {{ with printf \"count (ceph_osd_up{cluster='%s'})\" .Labels.cluster | query }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range printf \"(ceph_osd_up{cluster='%s'} * on(cluster, ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}" + summary: "More than 10% of OSDs are down on cluster {{ $labels.cluster }}" + expr: "count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 >= 10" labels: oid: "1.3.6.1.4.1.50495.1.2.1.4.1" severity: "critical" type: "ceph_default" - alert: "CephOSDHostDown" annotations: - description: "The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}" - summary: "An OSD host is offline" + description: "The following OSDs are down: {{- range printf \"(ceph_osd_up{cluster='%s'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" .Labels.cluster | query }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}" + summary: "An OSD host is offline on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"OSD_HOST_DOWN\"} == 1" for: "5m" labels: @@ -104,10 +103,9 @@ groups: type: "ceph_default" - alert: "CephOSDDown" annotations: - description: | - {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} + description: "{{ $num := printf \"count(ceph_osd_up{cluster='%s'} == 0) \" .Labels.cluster | query | first | value }}{{ $s := \"\" }}{{ if gt $num 1.0 }}{{ $s = \"s\" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. 
The following OSD{{ $s }} {{ if eq $s \"\" }}is{{ else }}are{{ end }} down: {{- range printf \"(ceph_osd_up{cluster='%s'} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down" - summary: "An OSD has been marked down" + summary: "An OSD has been marked down on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"OSD_DOWN\"} == 1" for: "5m" labels: @@ -118,7 +116,7 @@ groups: annotations: description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull" - summary: "OSD(s) running low on free space (NEARFULL)" + summary: "OSD(s) running low on free space (NEARFULL) on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"OSD_NEARFULL\"} == 1" for: "5m" labels: @@ -129,7 +127,7 @@ groups: annotations: description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full" - summary: "OSD full, writes blocked" + summary: "OSD full, writes blocked on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"OSD_FULL\"} > 0" for: "1m" labels: @@ -140,7 +138,7 @@ groups: annotations: description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull" - summary: "OSD(s) too full for backfill operations" + summary: "OSD(s) too full for backfill operations on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"OSD_BACKFILLFULL\"} > 0" for: "1m" labels: @@ -150,7 +148,7 @@ groups: annotations: description: "Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs" - summary: "OSD reports a high number of read errors" + summary: "OSD reports a high number of read errors on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"OSD_TOO_MANY_REPAIRS\"} == 1" for: "30s" labels: @@ -159,7 +157,7 @@ groups: - alert: "CephOSDTimeoutsPublicNetwork" annotations: description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs." 
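The CephOSDDownHigh expression above shows the companion rule for ratios: when one aggregate is divided by another, both sides must be grouped by the same cluster label so the binary operation matches per cluster instead of mixing clusters. The same expression, reformatted here with comments (metrics and threshold as in the hunk; layout is editorial):

    # Both aggregations keep "by (cluster)", so the division matches
    # one-to-one on the cluster label and produces one percentage per cluster.
    expr: |
      count by (cluster) (ceph_osd_up == 0)
        / count by (cluster) (ceph_osd_up) * 100 >= 10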
- summary: "Network issues delaying OSD heartbeats (public network)" + summary: "Network issues delaying OSD heartbeats (public network) on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_FRONT\"} == 1" for: "1m" labels: @@ -168,7 +166,7 @@ groups: - alert: "CephOSDTimeoutsClusterNetwork" annotations: description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs." - summary: "Network issues delaying OSD heartbeats (cluster network)" + summary: "Network issues delaying OSD heartbeats (cluster network) on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_BACK\"} == 1" for: "1m" labels: @@ -178,7 +176,7 @@ groups: annotations: description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch" - summary: "OSD size inconsistency error" + summary: "OSD size inconsistency error on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"BLUESTORE_DISK_SIZE_MISMATCH\"} == 1" for: "1m" labels: @@ -188,7 +186,7 @@ groups: annotations: description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#id2" - summary: "Device(s) predicted to fail soon" + summary: "Device(s) predicted to fail soon on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"DEVICE_HEALTH\"} == 1" for: "1m" labels: @@ -198,7 +196,7 @@ groups: annotations: description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany" - summary: "Too many devices are predicted to fail, unable to resolve" + summary: "Too many devices are predicted to fail on cluster {{ $labels.cluster }}, unable to resolve" expr: "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1" for: "1m" labels: @@ -209,7 +207,7 @@ groups: annotations: description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer." 
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use" - summary: "Device failure is predicted, but unable to relocate data" + summary: "Device failure is predicted, but unable to relocate data on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"DEVICE_HEALTH_IN_USE\"} == 1" for: "1m" labels: @@ -219,8 +217,8 @@ groups: annotations: description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)." documentation: "https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds" - summary: "Network issues are causing OSDs to flap (mark each other down)" - expr: "(rate(ceph_osd_up[5m]) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1" + summary: "Network issues are causing OSDs to flap (mark each other down) on cluster {{ $labels.cluster }}" + expr: "(rate(ceph_osd_up[5m]) * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1" labels: oid: "1.3.6.1.4.1.50495.1.2.1.4.4" severity: "warning" @@ -229,7 +227,7 @@ groups: annotations: description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors" - summary: "Device read errors detected" + summary: "Device read errors detected on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"BLUESTORE_SPURIOUS_READ_ERRORS\"} == 1" for: "30s" labels: @@ -238,12 +236,12 @@ groups: - alert: "CephPGImbalance" annotations: description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count." - summary: "PGs are not balanced across OSDs" + summary: "PGs are not balanced across OSDs on cluster {{ $labels.cluster }}" expr: | abs( - ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) / + ((ceph_osd_numpg > 0) - on (cluster,job) group_left avg(ceph_osd_numpg > 0) by (cluster,job)) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + ) * on (cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 for: "5m" labels: oid: "1.3.6.1.4.1.50495.1.2.1.4.5" @@ -255,7 +253,7 @@ groups: annotations: description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support." documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages" - summary: "CephFS filesystem is damaged." + summary: "CephFS filesystem is damaged on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"MDS_DAMAGE\"} > 0" for: "1m" labels: @@ -266,7 +264,7 @@ groups: annotations: description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline." 
documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down" - summary: "CephFS filesystem is offline" + summary: "CephFS filesystem is offline on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"MDS_ALL_DOWN\"} > 0" for: "1m" labels: @@ -277,7 +275,7 @@ groups: annotations: description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable." documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded" - summary: "CephFS filesystem is degraded" + summary: "CephFS filesystem is degraded on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"FS_DEGRADED\"} > 0" for: "1m" labels: @@ -288,7 +286,7 @@ groups: annotations: description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value." documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max" - summary: "Ceph MDS daemon count is lower than configured" + summary: "Ceph MDS daemon count is lower than configured on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"MDS_UP_LESS_THAN_MAX\"} > 0" for: "1m" labels: @@ -298,7 +296,7 @@ groups: annotations: description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons." documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby" - summary: "Ceph filesystem standby daemons too few" + summary: "Ceph filesystem standby daemons too few on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"MDS_INSUFFICIENT_STANDBY\"} > 0" for: "1m" labels: @@ -308,7 +306,7 @@ groups: annotations: description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS." documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds" - summary: "MDS daemon failed, no further standby available" + summary: "MDS daemon failed, no further standby available on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"FS_WITH_FAILED_MDS\"} > 0" for: "1m" labels: @@ -319,7 +317,7 @@ groups: annotations: description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support." documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages" - summary: "CephFS filesystem in read only mode due to write error(s)" + summary: "CephFS filesystem in read only mode due to write error(s) on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"MDS_HEALTH_READ_ONLY\"} > 0" for: "1m" labels: @@ -332,7 +330,7 @@ groups: annotations: description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure." 
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash" - summary: "A manager module has recently crashed" + summary: "A manager module has recently crashed on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"RECENT_MGR_MODULE_CRASH\"} == 1" for: "5m" labels: @@ -354,8 +352,8 @@ groups: - alert: "CephPGsInactive" annotations: description: "{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests." - summary: "One or more placement groups are inactive" - expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0" + summary: "One or more placement groups are inactive on cluster {{ $labels.cluster }}" + expr: "ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0" for: "5m" labels: oid: "1.3.6.1.4.1.50495.1.2.1.7.1" @@ -364,8 +362,8 @@ groups: - alert: "CephPGsUnclean" annotations: description: "{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure." - summary: "One or more placement groups are marked unclean" - expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0" + summary: "One or more placement groups are marked unclean on cluster {{ $labels.cluster }}" + expr: "ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0" for: "15m" labels: oid: "1.3.6.1.4.1.50495.1.2.1.7.2" @@ -375,7 +373,7 @@ groups: annotations: description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged" - summary: "Placement group damaged, manual intervention needed" + summary: "Placement group damaged, manual intervention needed on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=~\"PG_DAMAGED|OSD_SCRUB_ERRORS\"} == 1" for: "5m" labels: @@ -386,7 +384,7 @@ groups: annotations: description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full" - summary: "OSDs are too full for recovery" + summary: "OSDs are too full for recovery on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"PG_RECOVERY_FULL\"} == 1" for: "1m" labels: @@ -397,7 +395,7 @@ groups: annotations: description: "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability" - summary: "PG is unavailable, blocking I/O" + summary: "PG is unavailable on cluster {{ $labels.cluster }}, blocking I/O" expr: "((ceph_health_detail{name=\"PG_AVAILABILITY\"} == 1) - scalar(ceph_health_detail{name=\"OSD_DOWN\"})) == 1" for: "1m" labels: @@ -408,7 +406,7 @@ groups: annotations: description: "Data redundancy may be at risk due to lack of free space within the cluster. 
One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full" - summary: "Backfill operations are blocked due to lack of free space" + summary: "Backfill operations are blocked due to lack of free space on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"PG_BACKFILL_FULL\"} == 1" for: "1m" labels: @@ -419,7 +417,7 @@ groups: annotations: description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed" - summary: "Placement group(s) have not been scrubbed" + summary: "Placement group(s) have not been scrubbed on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"PG_NOT_SCRUBBED\"} == 1" for: "5m" labels: @@ -429,7 +427,7 @@ groups: annotations: description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs" - summary: "Placement groups per OSD is too high" + summary: "Placement groups per OSD is too high on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"TOO_MANY_PGS\"} == 1" for: "1m" labels: @@ -439,7 +437,7 @@ groups: annotations: description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed" - summary: "Placement group(s) have not been deep scrubbed" + summary: "Placement group(s) have not been deep scrubbed on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"PG_NOT_DEEP_SCRUBBED\"} == 1" for: "5m" labels: @@ -479,7 +477,7 @@ groups: - alert: "CephNodeNetworkPacketErrors" annotations: description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}." - summary: "One or more NICs reports packet errors" + summary: "One or more NICs reports packet errors on cluster {{ $labels.cluster }}" expr: | ( rate(node_network_receive_errs_total{device!="lo"}[1m]) + @@ -498,7 +496,7 @@ groups: - alert: "CephNodeNetworkBondDegraded" annotations: description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}." 
- summary: "Degraded Bond on Node {{ $labels.instance }}" + summary: "Degraded Bond on Node {{ $labels.instance }} on cluster {{ $labels.cluster }}" expr: | node_bonding_slaves - node_bonding_active != 0 labels: @@ -507,8 +505,8 @@ groups: - alert: "CephNodeDiskspaceWarning" annotations: description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate." - summary: "Host filesystem free space is getting low" - expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0" + summary: "Host filesystem free space is getting low on cluster {{ $labels.cluster }}" + expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) * on(cluster, instance) group_left(nodename) node_uname_info < 0" labels: oid: "1.3.6.1.4.1.50495.1.2.1.8.4" severity: "warning" @@ -516,8 +514,8 @@ groups: - alert: "CephNodeInconsistentMTU" annotations: description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}." - summary: "MTU settings across Ceph hosts are inconsistent" - expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )" + summary: "MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster }}" + expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )" labels: severity: "warning" type: "ceph_default" @@ -526,8 +524,8 @@ groups: - alert: "CephPoolGrowthWarning" annotations: description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours." - summary: "Pool growth rate may soon exceed capacity" - expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95" + summary: "Pool growth rate may soon exceed capacity on cluster {{ $labels.cluster }}" + expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(cluster,pool_id, instance) group_right() ceph_pool_metadata) >= 95" labels: oid: "1.3.6.1.4.1.50495.1.2.1.9.2" severity: "warning" @@ -535,16 +533,16 @@ groups: - alert: "CephPoolBackfillFull" annotations: description: "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity." 
- summary: "Free space in a pool is too low for recovery/backfill" + summary: "Free space in a pool is too low for recovery/backfill on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"POOL_BACKFILLFULL\"} > 0" labels: severity: "warning" type: "ceph_default" - alert: "CephPoolFull" annotations: - description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)" + description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range printf \"topk(5, sort_desc(ceph_pool_percent_used{cluster='%s'} * on(cluster,pool_id) group_right ceph_pool_metadata))\" .Labels.cluster | query }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full" - summary: "Pool is full - writes are blocked" + summary: "Pool is full - writes are blocked on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"POOL_FULL\"} > 0" for: "1m" labels: @@ -554,7 +552,7 @@ groups: - alert: "CephPoolNearFull" annotations: description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active." - summary: "One or more Ceph pools are nearly full" + summary: "One or more Ceph pools are nearly full on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"POOL_NEAR_FULL\"} > 0" for: "5m" labels: @@ -566,7 +564,7 @@ groups: annotations: description: "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" - summary: "OSD operations are slow to complete" + summary: "OSD operations are slow to complete on cluster {{ $labels.cluster }}" expr: "ceph_healthcheck_slow_ops > 0" for: "30s" labels: @@ -576,7 +574,7 @@ groups: annotations: description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)" documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" - summary: "{{ $labels.ceph_daemon }} operations are slow to complete" + summary: "{{ $labels.ceph_daemon }} operations are slow to complete on cluster {{ $labels.cluster }}" expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0" for: "30s" labels: @@ -587,7 +585,7 @@ groups: - alert: "CephadmUpgradeFailed" annotations: description: "The cephadm cluster upgrade process has failed. 
The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue" - summary: "Ceph version upgrade has failed" + summary: "Ceph version upgrade has failed on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"UPGRADE_EXCEPTION\"} > 0" for: "30s" labels: @@ -597,7 +595,7 @@ groups: - alert: "CephadmDaemonFailed" annotations: description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'" - summary: "A ceph daemon managed by cephadm is down" + summary: "A ceph daemon managed by cephadm is down on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"CEPHADM_FAILED_DAEMON\"} > 0" for: "30s" labels: @@ -608,7 +606,7 @@ groups: annotations: description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'" documentation: "https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused" - summary: "Orchestration tasks via cephadm are PAUSED" + summary: "Orchestration tasks via cephadm are PAUSED on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"CEPHADM_PAUSED\"} > 0" for: "1m" labels: @@ -619,7 +617,7 @@ groups: - alert: "HardwareStorageError" annotations: description: "Some storage devices are in error. Check `ceph health detail`." - summary: "Storage devices error(s) detected" + summary: "Storage devices error(s) detected on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0" for: "30s" labels: @@ -629,7 +627,7 @@ groups: - alert: "HardwareMemoryError" annotations: description: "DIMM error(s) detected. Check `ceph health detail`." - summary: "DIMM error(s) detected" + summary: "DIMM error(s) detected on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0" for: "30s" labels: @@ -639,7 +637,7 @@ groups: - alert: "HardwareProcessorError" annotations: description: "Processor error(s) detected. Check `ceph health detail`." - summary: "Processor error(s) detected" + summary: "Processor error(s) detected on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0" for: "30s" labels: @@ -649,7 +647,7 @@ groups: - alert: "HardwareNetworkError" annotations: description: "Network error(s) detected. Check `ceph health detail`." - summary: "Network error(s) detected" + summary: "Network error(s) detected on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0" for: "30s" labels: @@ -659,7 +657,7 @@ groups: - alert: "HardwarePowerError" annotations: description: "Power supply error(s) detected. Check `ceph health detail`." - summary: "Power supply error(s) detected" + summary: "Power supply error(s) detected on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0" for: "30s" labels: @@ -669,7 +667,7 @@ groups: - alert: "HardwareFanError" annotations: description: "Fan error(s) detected. Check `ceph health detail`." - summary: "Fan error(s) detected" + summary: "Fan error(s) detected on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0" for: "30s" labels: @@ -694,8 +692,8 @@ groups: annotations: description: "The latest version of a RADOS object can not be found, even though all OSDs are up. 
I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound" - summary: "Object(s) marked UNFOUND" - expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1" + summary: "Object(s) marked UNFOUND on cluster {{ $labels.cluster }}" + expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() group_right(cluster) (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by(cluster)) == 1" for: "30s" labels: oid: "1.3.6.1.4.1.50495.1.2.1.10.1" @@ -707,7 +705,7 @@ groups: annotations: description: "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command." documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash" - summary: "One or more Ceph daemons have crashed, and are pending acknowledgement" + summary: "One or more Ceph daemons have crashed, and are pending acknowledgement on cluster {{ $labels.cluster }}" expr: "ceph_health_detail{name=\"RECENT_CRASH\"} == 1" for: "1m" labels: @@ -718,9 +716,9 @@ groups: rules: - alert: "CephRBDMirrorImagesPerDaemonHigh" annotations: - description: "Number of image replications per daemon is not suppossed to go beyond threshold 100" - summary: "Number of image replications are now above 100" - expr: "sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100" + description: "Number of image replications per daemon is not supposed to go beyond threshold 100" + summary: "Number of image replications are now above 100 on cluster {{ $labels.cluster }}" + expr: "sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100" for: "1m" labels: oid: "1.3.6.1.4.1.50495.1.2.1.10.2" @@ -729,8 +727,8 @@ groups: - alert: "CephRBDMirrorImagesNotInSync" annotations: description: "Both local and remote RBD mirror images should be in sync." - summary: "Some of the RBD mirror images are not in sync with the remote counter parts." - expr: "sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0" + summary: "Some of the RBD mirror images are not in sync with the remote counter parts on cluster {{ $labels.cluster }}" + expr: "sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0" for: "1m" labels: oid: "1.3.6.1.4.1.50495.1.2.1.10.3" @@ -738,9 +736,9 @@ groups: type: "ceph_default" - alert: "CephRBDMirrorImagesNotInSyncVeryHigh" annotations: - description: "More than 10% of the images have synchronization problems" - summary: "Number of unsynchronized images are very high." 
- expr: "count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)" + description: "More than 10% of the images have synchronization problems." + summary: "Number of unsynchronized images are very high on cluster {{ $labels.cluster }}" + expr: "count by (ceph_daemon, cluster) ((topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots)*.1)" for: "1m" labels: oid: "1.3.6.1.4.1.50495.1.2.1.10.4" @@ -749,7 +747,7 @@ groups: - alert: "CephRBDMirrorImageTransferBandwidthHigh" annotations: description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously" - summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes" + summary: "The replication network usage on cluster {{ $labels.cluster }} has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes" expr: "rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80" for: "1m" labels: @@ -761,8 +759,8 @@ groups: - alert: "NVMeoFSubsystemNamespaceLimit" annotations: description: "Subsystems have a max namespace limit defined at creation time. 
This alert means that no more namespaces can be added to {{ $labels.nqn }}" - summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces " - expr: "(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit" + summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces on cluster {{ $labels.cluster }}" + expr: "(count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit" for: "1m" labels: severity: "warning" @@ -770,17 +768,17 @@ groups: - alert: "NVMeoFTooManyGateways" annotations: description: "You may create many gateways, but 4 is the tested limit" - summary: "Max supported gateways exceeded " - expr: "count(ceph_nvmeof_gateway_info) > 4.00" + summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}" + expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00" for: "1m" labels: severity: "warning" type: "ceph_default" - alert: "NVMeoFMaxGatewayGroupSize" annotations: - description: "You may create many gateways in a gateway group, but 2 is the tested limit" - summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded " - expr: "count by(group) (ceph_nvmeof_gateway_info) > 2.00" + description: "You may create many gateways in a gateway group, but 4 is the tested limit" + summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}" + expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00" for: "1m" labels: severity: "warning" @@ -788,8 +786,8 @@ groups: - alert: "NVMeoFSingleGatewayGroup" annotations: description: "Although a single member gateway group is valid, it should only be used for test purposes" - summary: "The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible " - expr: "count by(group) (ceph_nvmeof_gateway_info) == 1" + summary: "The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible on cluster {{ $labels.cluster }}" + expr: "count(ceph_nvmeof_gateway_info) by(cluster,group) == 1" for: "5m" labels: severity: "warning" @@ -797,8 +795,8 @@ groups: - alert: "NVMeoFHighGatewayCPU" annotations: description: "Typically, high CPU may indicate degraded performance. 
Consider increasing the number of reactor cores" - summary: "CPU used by {{ $labels.instance }} NVMe-oF Gateway is high " - expr: "label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") > 80.00" + summary: "CPU used by {{ $labels.instance }} NVMe-oF Gateway is high on cluster {{ $labels.cluster }}" + expr: "label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") > 80.00" for: "10m" labels: severity: "warning" @@ -806,7 +804,7 @@ groups: - alert: "NVMeoFGatewayOpenSecurity" annotations: description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss" - summary: "Subsystem {{ $labels.nqn }} has been defined without host level security " + summary: "Subsystem {{ $labels.nqn }} has been defined without host level security on cluster {{ $labels.cluster }}" expr: "ceph_nvmeof_subsystem_metadata{allow_any_host=\"yes\"}" for: "5m" labels: @@ -815,8 +813,8 @@ groups: - alert: "NVMeoFTooManySubsystems" annotations: description: "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported" - summary: "The number of subsystems defined to the gateway exceeds supported values " - expr: "count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00" + summary: "The number of subsystems defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}" + expr: "count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00" for: "1m" labels: severity: "warning" @@ -824,8 +822,8 @@ groups: - alert: "NVMeoFVersionMismatch" annotations: description: "This may indicate an issue with deployment. 
Check cephadm logs" - summary: "The cluster has different NVMe-oF gateway releases active " - expr: "count(count by(version) (ceph_nvmeof_gateway_info)) > 1" + summary: "Too many different NVMe-oF gateway releases active on cluster {{ $labels.cluster }}" + expr: "count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1" for: "1h" labels: severity: "warning" @@ -833,7 +831,7 @@ groups: - alert: "NVMeoFHighClientCount" annotations: description: "The supported limit for clients connecting to a subsystem is 32" - summary: "The number of clients connected to {{ $labels.nqn }} is too high " + summary: "The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}" expr: "ceph_nvmeof_subsystem_host_count > 32.00" for: "1m" labels: @@ -842,8 +840,8 @@ groups: - alert: "NVMeoFHighHostCPU" annotations: description: "High CPU on a gateway host can lead to CPU contention and performance degradation" - summary: "The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) " - expr: "100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00" + summary: "The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) on cluster {{ $labels.cluster }}" + expr: "100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00" for: "10m" labels: severity: "warning" @@ -851,7 +849,7 @@ groups: - alert: "NVMeoFInterfaceDown" annotations: description: "A NIC used by one or more subsystems is in a down state" - summary: "Network interface {{ $labels.device }} is down " + summary: "Network interface {{ $labels.device }} is down on cluster {{ $labels.cluster }}" expr: "ceph_nvmeof_subsystem_listener_iface_info{operstate=\"down\"}" for: "30s" labels: @@ -861,7 +859,7 @@ groups: - alert: "NVMeoFInterfaceDuplex" annotations: description: "Until this is resolved, performance from the gateway will be degraded" - summary: "Network interface {{ $labels.device }} is not running in full duplex mode " + summary: "Network interface {{ $labels.device }} is not running in full duplex mode on cluster {{ $labels.cluster }}" expr: "ceph_nvmeof_subsystem_listener_iface_info{duplex!=\"full\"}" for: "30s" labels: diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 40d6f4d0983..a269ff74227 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -5,13 +5,13 @@ tests: # health error - interval: 5m input_series: - - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}' + - series: 'ceph_health_status{instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '2 2 2 2 2 2 2' promql_expr_test: - expr: ceph_health_status == 2 eval_time: 5m exp_samples: - - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}' + - labels: 'ceph_health_status{instance="ceph:9283",job="ceph",cluster="mycluster"}' value: 2 alert_rule_test: - eval_time: 1m @@ -25,20 +25,21 @@ tests: oid: 1.3.6.1.4.1.50495.1.2.1.2.1 type: ceph_default severity: critical + cluster: mycluster exp_annotations: - summary: Ceph is in the ERROR state - description: 
The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information. + summary: Ceph is in the ERROR state on cluster mycluster + description: The cluster state has been HEALTH_ERROR for more than 5 minutes on cluster mycluster. Please check 'ceph health detail' for more information. # health warning - interval: 5m input_series: - - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}' + - series: 'ceph_health_status{instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_status == 1 eval_time: 15m exp_samples: - - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}' + - labels: 'ceph_health_status{instance="ceph:9283",job="ceph",cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 10m @@ -51,45 +52,46 @@ tests: job: ceph type: ceph_default severity: warning + cluster: mycluster exp_annotations: - summary: Ceph is in the WARNING state - description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information. + summary: Ceph is in the WARNING state on cluster mycluster + description: The cluster state has been HEALTH_WARN for more than 15 minutes on cluster mycluster. Please check 'ceph health detail' for more information. # 10% OSDs down - interval: 1m input_series: - - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' + - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' + - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '0 0 0 0 0' - - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' + - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '1 1 1 1 1' - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",cluster="mycluster",objectstore="bluestore", + public_addr="172.20.0.2"}' values: '1 1 1 1 1' - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",cluster="mycluster",objectstore="bluestore", + public_addr="172.20.0.2"}' values: '1 1 1 1 1' - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - 
cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",cluster="mycluster",objectstore="bluestore", + public_addr="172.20.0.2"}' values: '1 1 1 1 1' promql_expr_test: - - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 + - expr: count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 >= 10 eval_time: 1m exp_samples: - - labels: '{}' + - labels: '{cluster="mycluster" }' value: 3.333333333333333E+01 alert_rule_test: - eval_time: 1m @@ -99,39 +101,40 @@ tests: oid: 1.3.6.1.4.1.50495.1.2.1.4.1 type: ceph_default severity: critical + cluster: mycluster exp_annotations: - summary: More than 10% of OSDs are down + summary: More than 10% of OSDs are down on cluster mycluster description: "33.33% or 1 of 3 OSDs are down (>= 10%). The following OSDs are down: - osd.1 on ceph" # flapping OSD - interval: 1s input_series: - - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' + - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '1+1x100' - - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' + - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '1+0x100' - - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' + - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '1+0x100' - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2",cluster="mycluster"}' values: '1 1 1 1 1 1' - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2",cluster="mycluster"}' values: '1 1 1 1 1 1' - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + ceph_version="ceph version 17.0.0-189-g3558fd72 + 
(3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", + cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", + public_addr="172.20.0.2",cluster="mycluster"}' values: '1 1 1 1 1 1' promql_expr_test: - expr: | @@ -142,7 +145,7 @@ tests: eval_time: 1m exp_samples: - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283", - job="ceph"}' + job="ceph",cluster="mycluster"}' value: 1.2200000000000001E+01 alert_rule_test: - eval_time: 5m @@ -155,54 +158,39 @@ tests: job: ceph oid: 1.3.6.1.4.1.50495.1.2.1.4.4 severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds - summary: Network issues are causing OSDs to flap (mark each other down) + summary: Network issues are causing OSDs to flap (mark each other down) on cluster mycluster description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)." # high pg count deviation - interval: 1m input_series: - - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283", - job="ceph"}' + - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '100 100 100 100 100 160' - - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283", - job="ceph"}' + - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '100 100 100 100 100 320' - - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283", - job="ceph"}' + - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '100 100 100 100 100 160' - - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283", - job="ceph"}' + - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '100 100 100 100 100 160' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",public_addr="172.20.0.2",cluster="mycluster"}' values: '1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + 
hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",public_addr="172.20.0.2",cluster="mycluster"}' values: '1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",public_addr="172.20.0.2",cluster="mycluster"}' values: '1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' + - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",ceph_version="ceph version 17.0.0-189-g3558fd72 + (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", + hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",public_addr="172.20.0.2",cluster="mycluster"}' values: '1 1 1 1 1 1' promql_expr_test: - expr: | @@ -215,8 +203,7 @@ tests: eval_time: 5m exp_samples: - - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283", - job="ceph"}' + - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",job="ceph",cluster="mycluster"}' value: 6E-01 alert_rule_test: - eval_time: 10m @@ -229,44 +216,40 @@ tests: job: ceph oid: 1.3.6.1.4.1.50495.1.2.1.4.5 severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: PGs are not balanced across OSDs + summary: PGs are not balanced across OSDs on cluster mycluster description: "OSD osd.1 on ceph deviates by more than 30% from average PG count." 
# pgs inactive - interval: 1m input_series: - - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="device_health_metrics",pool_id="1"}' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="1"}' values: '1 1 1 1 1 1 1 1' - - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="device_health_metrics",pool_id="2"}' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="2"}' values: '1 1 1 1 1 1 1 1' - - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="device_health_metrics",pool_id="3"}' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="3"}' values: '1 1 1 1 1 1 1 1' - - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="1"}' values: '1 1 1 1 1 1 1 1' - - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="2"}' values: '32 32 32 32 32 32 32 32' - - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="3"}' values: '33 32 32 32 32 33 33 32' - - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}' + - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="1"}' values: '1 1 1 1 1 1 1 1 1' - - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}' + - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="2"}' values: '32 32 32 32 32 32 32 32' - - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}' + - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="3"}' values: '32 32 32 32 32 32 32 32' promql_expr_test: - expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0 eval_time: 5m exp_samples: - - labels: '{instance="ceph:9283", job="ceph", - name="device_health_metrics", - pool_id="3"}' - value: 1 + - labels: '{instance="ceph:9283", job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="3"}' + value: 1 alert_rule_test: - eval_time: 5m alertname: CephPGsInactive @@ -278,46 +261,39 @@ tests: oid: 1.3.6.1.4.1.50495.1.2.1.7.1 pool_id: 3 severity: critical + cluster: mycluster type: ceph_default exp_annotations: - summary: One or more placement groups are inactive + summary: One or more placement groups are inactive on cluster mycluster description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests." 
#pgs unclean - interval: 1m input_series: - - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="device_health_metrics",pool_id="1"}' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="1"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="device_health_metrics",pool_id="2"}' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="2"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="device_health_metrics",pool_id="3"}' + - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",cluster="mycluster",name="device_health_metrics",pool_id="3"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="1"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}' - values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 - 32 32 32' - - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}' - values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 - 33 33' - - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="2"}' + values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32' + - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="3"}' + values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33' + - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="1"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}' - values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 - 32 32' - - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}' - values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 - 32 32' + - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="2"}' + values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32' + - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",cluster="mycluster",pool_id="3"}' + values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32' promql_expr_test: - expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0 eval_time: 15m exp_samples: - - labels: '{instance="ceph:9283", job="ceph", - name="device_health_metrics", pool_id="3"}' + - labels: '{instance="ceph:9283", job="ceph",cluster="mycluster",name="device_health_metrics", pool_id="3"}' value: 1 alert_rule_test: - eval_time: 16m @@ -330,32 +306,33 @@ tests: oid: 1.3.6.1.4.1.50495.1.2.1.7.2 pool_id: 3 severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: One or more placement groups are marked unclean + summary: One or more placement groups are marked unclean on cluster mycluster description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure." 
# root volume full - interval: 1m input_series: - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost - --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", - mountpoint="/"}' + --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", + mountpoint="/"}' values: '35336400896 35336400896 35336400896 35336400896 35336400896 - 3525385519.104 3533640089' + 3525385519.104 3533640089' - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost - --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", - mountpoint="/"}' + --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", + mountpoint="/"}' values: '73445531648 73445531648 73445531648 73445531648 73445531648 - 73445531648 73445531648' + 73445531648 73445531648' promql_expr_test: - expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5 eval_time: 5m exp_samples: - labels: '{device="/dev/mapper/fedora_localhost --live-home", - fstype="ext4", instance="node-exporter", job="node-exporter", - mountpoint="/"}' + fstype="ext4", instance="node-exporter", job="node-exporter", + mountpoint="/"}' value: 4.8E+00 alert_rule_test: - eval_time: 10m @@ -377,17 +354,13 @@ tests: # network packets dropped - interval: 1m input_series: - - series: 'node_network_receive_drop_total{device="eth0", - instance="node-exporter",job="node-exporter"}' + - series: 'node_network_receive_drop_total{device="eth0",instance="node-exporter",job="node-exporter"}' values: '0+600x10' - - series: 'node_network_transmit_drop_total{device="eth0", - instance="node-exporter",job="node-exporter"}' + - series: 'node_network_transmit_drop_total{device="eth0",instance="node-exporter",job="node-exporter"}' values: '0+600x10' - - series: 'node_network_receive_packets_total{device="eth0", - instance="node-exporter",job="node-exporter"}' + - series: 'node_network_receive_packets_total{device="eth0",instance="node-exporter",job="node-exporter"}' values: '0+750x10' - - series: 'node_network_transmit_packets_total{device="eth0", - instance="node-exporter",job="node-exporter"}' + - series: 'node_network_transmit_packets_total{device="eth0",instance="node-exporter",job="node-exporter"}' values: '0+750x10' promql_expr_test: - expr: | @@ -404,8 +377,7 @@ tests: eval_time: 5m exp_samples: - - labels: '{device="eth0", instance="node-exporter", - job="node-exporter"}' + - labels: '{device="eth0", instance="node-exporter",job="node-exporter"}' value: 8E-1 alert_rule_test: - eval_time: 5m @@ -425,17 +397,13 @@ tests: # network packets errors - interval: 1m input_series: - - series: 'node_network_receive_errs_total{device="eth0", - instance="node-exporter",job="node-exporter"}' + - series: 'node_network_receive_errs_total{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '0+600x10' - - series: 'node_network_transmit_errs_total{device="eth0", - instance="node-exporter",job="node-exporter"}' + - series: 'node_network_transmit_errs_total{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '0+600x10' - - series: 'node_network_transmit_packets_total{device="eth0", - instance="node-exporter",job="node-exporter"}' + - series: 'node_network_transmit_packets_total{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '0+750x10' - - series: 'node_network_receive_packets_total{device="eth0", - instance="node-exporter",job="node-exporter"}' + - series: 
'node_network_receive_packets_total{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '0+750x10' promql_expr_test: - expr: | @@ -452,8 +420,7 @@ tests: eval_time: 5m exp_samples: - - labels: '{device="eth0", instance="node-exporter", - job="node-exporter"}' + - labels: '{device="eth0", instance="node-exporter",job="node-exporter",cluster="mycluster"}' value: 8E-01 alert_rule_test: - eval_time: 5m @@ -466,26 +433,24 @@ tests: oid: 1.3.6.1.4.1.50495.1.2.1.8.3 severity: warning type: ceph_default - exp_annotations: - summary: One or more NICs reports packet errors + cluster: mycluster + exp_annotations: + summary: One or more NICs reports packet errors on cluster mycluster description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0." # Bond is missing a peer - interval: 1m input_series: - - series: 'node_bonding_active{master="bond0", - instance="node-exporter",job="node-exporter"}' + - series: 'node_bonding_active{master="bond0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '3' - - series: 'node_bonding_slaves{master="bond0", - instance="node-exporter",job="node-exporter"}' + - series: 'node_bonding_slaves{master="bond0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '4' promql_expr_test: - expr: | node_bonding_slaves - node_bonding_active != 0 eval_time: 5m exp_samples: - - labels: '{master="bond0", instance="node-exporter", - job="node-exporter"}' + - labels: '{master="bond0", instance="node-exporter",job="node-exporter",cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 5m @@ -497,23 +462,22 @@ tests: job: node-exporter severity: warning type: ceph_default + cluster: mycluster exp_annotations: - summary: Degraded Bond on Node node-exporter + summary: Degraded Bond on Node node-exporter on cluster mycluster description: "Bond bond0 is degraded on Node node-exporter." 
# Node Storage disk space filling up - interval: 1m # 20GB = 21474836480, 256MB = 268435456 input_series: - - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root", - fstype="xfs",instance="node-1",mountpoint="/rootfs"}' + - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",fstype="xfs",instance="node-1",mountpoint="/rootfs",cluster="mycluster"}' values: '21474836480-268435456x48' - - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root", - fstype="xfs",instance="node-2",mountpoint="/rootfs"}' + - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",fstype="xfs",instance="node-2",mountpoint="/rootfs",cluster="mycluster"}' values: '21474836480+0x48' - - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}' + - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com",cluster="mycluster"}' values: 1+0x48 - - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}' + - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com",cluster="mycluster"}' values: 1+0x48 promql_expr_test: - expr: | @@ -521,8 +485,7 @@ tests: on(instance) group_left(nodename) node_uname_info < 0 eval_time: 5m exp_samples: - - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs", - mountpoint="/rootfs",nodename="node-1.unittests.com"}' + - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",mountpoint="/rootfs",nodename="node-1.unittests.com",cluster="mycluster"}' value: -1.912602624E+12 alert_rule_test: - eval_time: 5m @@ -537,72 +500,60 @@ tests: instance: node-1 mountpoint: /rootfs nodename: node-1.unittests.com + cluster: mycluster exp_annotations: - summary: Host filesystem free space is getting low + summary: Host filesystem free space is getting low on cluster mycluster description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate." 
+ # MTU Mismatch - interval: 1m input_series: - - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '1500 1500 1500 1500 1500' - - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '9000 9000 9000 9000 9000' - - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}' values: '2200 2200 2200 2200 2200' - - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2", - job="node-exporter"}' + - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}' values: '2400 2400 2400 2400 2400' - - series: 'node_network_up{device="eth0",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_up{device="eth0",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '0 0 0 0 0' - - series: 'node_network_up{device="eth1",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_up{device="eth1",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '0 0 0 0 0' - - series: 'node_network_up{device="eth2",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_up{device="eth2",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth3",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_up{device="eth3",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth4",instance="node-exporter", - job="node-exporter"}' + - series: 'node_network_up{device="eth4",instance="node-exporter",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth4",instance="hostname1", - job="node-exporter"}' + - series: 'node_network_up{device="eth4",instance="hostname1",job="node-exporter",cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'node_network_up{device="eth4",instance="hostname2", - job="node-exporter"}' + - series: 'node_network_up{device="eth4",instance="hostname2",job="node-exporter",cluster="mycluster"}' values: '0 0 0 0 0' promql_expr_test: - expr: | node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( - max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != - quantile by (device) (.5, 
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != + quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) ) or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( - min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != - quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != + quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) ) eval_time: 1m exp_samples: - - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}' + - labels: '{device="eth4", instance="node-exporter", job="node-exporter", cluster="mycluster"}' value: 9000 - - labels: '{device="eth4", instance="hostname1", job="node-exporter"}' + - labels: '{device="eth4", instance="hostname1", job="node-exporter", cluster="mycluster"}' value: 2200 alert_rule_test: - eval_time: 1m @@ -614,8 +565,9 @@ tests: job: node-exporter severity: warning type: ceph_default + cluster: "mycluster" exp_annotations: - summary: MTU settings across Ceph hosts are inconsistent + summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4." - exp_labels: device: eth4 @@ -623,51 +575,52 @@ tests: job: node-exporter severity: warning type: ceph_default + cluster: "mycluster" exp_annotations: - summary: MTU settings across Ceph hosts are inconsistent + summary: MTU settings across Ceph hosts are inconsistent on cluster mycluster description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4." 
# pool full, data series has 6 but using topk(5) so to ensure the # results are working as expected - interval: 1m input_series: - - series: 'ceph_health_detail{name="POOL_FULL"}' + - series: 'ceph_health_detail{name="POOL_FULL", cluster="mycluster"}' values: '0 0 0 1 1 1 1 1 1 1 1' - - series: 'ceph_pool_percent_used{pool_id="1"}' + - series: 'ceph_pool_percent_used{pool_id="1", cluster="mycluster"}' values: '32+0x10' - - series: 'ceph_pool_percent_used{pool_id="2"}' + - series: 'ceph_pool_percent_used{pool_id="2", cluster="mycluster"}' values: '96+0x10' - - series: 'ceph_pool_percent_used{pool_id="3"}' + - series: 'ceph_pool_percent_used{pool_id="3", cluster="mycluster"}' values: '90+0x10' - - series: 'ceph_pool_percent_used{pool_id="4"}' + - series: 'ceph_pool_percent_used{pool_id="4", cluster="mycluster"}' values: '72+0x10' - - series: 'ceph_pool_percent_used{pool_id="5"}' + - series: 'ceph_pool_percent_used{pool_id="5", cluster="mycluster"}' values: '19+0x10' - - series: 'ceph_pool_percent_used{pool_id="6"}' + - series: 'ceph_pool_percent_used{pool_id="6", cluster="mycluster"}' values: '10+0x10' - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="cephfs_data",pool_id="1"}' + name="cephfs_data",pool_id="1", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1' - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="rbd",pool_id="2"}' + name="rbd",pool_id="2", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1' - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="iscsi",pool_id="3"}' + name="iscsi",pool_id="3", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1' - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="default.rgw.index",pool_id="4"}' + name="default.rgw.index",pool_id="4", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1' - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="default.rgw.log",pool_id="5"}' + name="default.rgw.log",pool_id="5", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1' - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", - name="dummy",pool_id="6"}' + name="dummy",pool_id="6", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="POOL_FULL"} > 0 eval_time: 5m exp_samples: - - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}' + - labels: '{__name__="ceph_health_detail", name="POOL_FULL", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -678,23 +631,24 @@ tests: - exp_labels: name: POOL_FULL severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.9.1 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full - summary: Pool is full - writes are blocked + summary: Pool is full - writes are blocked on cluster mycluster description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. 
ceph osd pool set quota <pool_name> max_bytes <bytes>)" + # slow OSD ops - interval : 1m input_series: - - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}' + - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph",cluster="mycluster"}' values: '1+0x120' promql_expr_test: - expr: ceph_healthcheck_slow_ops > 0 eval_time: 1m exp_samples: - - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283", - job="ceph"}' + - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",job="ceph",cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 20m @@ -704,23 +658,23 @@ tests: instance: ceph:9283 job: ceph severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops - summary: OSD operations are slow to complete + summary: OSD operations are slow to complete on cluster mycluster description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)" # slow daemon ops - interval : 1m input_series: - - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}' + - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283", job="ceph", type="SLOW_OPS", cluster="mycluster"}' values: '1+0x120' promql_expr_test: - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0' eval_time: 1m exp_samples: - - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283", - job="ceph", type="SLOW_OPS"}' + - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1", instance="ceph:9283", cluster="mycluster",job="ceph", type="SLOW_OPS"}' value: 1 alert_rule_test: - eval_time: 20m @@ -731,22 +685,23 @@ tests: ceph_daemon: "osd.1" job: ceph severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops - summary: osd.1 operations are slow to complete + summary: osd.1 operations are slow to complete on cluster mycluster description: "osd.1 operations are taking too long to process (complaint time exceeded)" # CEPHADM orchestrator alert triggers - interval: 30s input_series: - - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}' + - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION", cluster="mycluster"}' values: '1+0x40' promql_expr_test: - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}' + - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -757,20 +712,21 @@ tests: - exp_labels: name: UPGRADE_EXCEPTION severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.11.2 exp_annotations: - summary: Ceph version upgrade has failed + summary: Ceph version upgrade has failed on cluster mycluster description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. 
Please review the cephadm logs, to understand the nature of the issue" - interval: 30s input_series: - - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}' + - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON", cluster="mycluster"}' values: '1+0x40' promql_expr_test: - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}' + - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -781,20 +737,21 @@ tests: - exp_labels: name: CEPHADM_FAILED_DAEMON severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.11.1 exp_annotations: - summary: A ceph daemon managed by cephadm is down + summary: A ceph daemon managed by cephadm is down on cluster mycluster description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'" - interval: 1m input_series: - - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}' + - series: 'ceph_health_detail{name="CEPHADM_PAUSED", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}' + - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -805,21 +762,23 @@ tests: - exp_labels: name: CEPHADM_PAUSED severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused - summary: Orchestration tasks via cephadm are PAUSED + summary: Orchestration tasks via cephadm are PAUSED on cluster mycluster description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'" + # MDS - interval: 1m input_series: - - series: 'ceph_health_detail{name="MDS_DAMAGE"}' + - series: 'ceph_health_detail{name="MDS_DAMAGE", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}' + - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -830,21 +789,22 @@ tests: - exp_labels: name: MDS_DAMAGE severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.5.1 exp_annotations: documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages - summary: CephFS filesystem is damaged. + summary: CephFS filesystem is damaged on cluster mycluster description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support." 
- interval: 1m input_series: - - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}' + - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}' + - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -855,21 +815,22 @@ tests: - exp_labels: name: MDS_HEALTH_READ_ONLY severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.5.2 exp_annotations: documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages - summary: CephFS filesystem in read only mode due to write error(s) + summary: CephFS filesystem in read only mode due to write error(s) on cluster mycluster description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support." - interval: 1m input_series: - - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}' + - series: 'ceph_health_detail{name="MDS_ALL_DOWN", cluster="mycluster"}' values: '0 0 1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}' + - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -880,21 +841,22 @@ tests: - exp_labels: name: MDS_ALL_DOWN severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.5.3 exp_annotations: documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down - summary: CephFS filesystem is offline + summary: CephFS filesystem is offline on cluster mycluster description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline." - interval: 1m input_series: - - series: 'ceph_health_detail{name="FS_DEGRADED"}' + - series: 'ceph_health_detail{name="FS_DEGRADED", cluster="mycluster"}' values: '0 0 1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="FS_DEGRADED"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}' + - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -905,21 +867,22 @@ tests: - exp_labels: name: FS_DEGRADED severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.5.4 exp_annotations: documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded - summary: CephFS filesystem is degraded + summary: CephFS filesystem is degraded on cluster mycluster description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable." 
- interval: 1m input_series: - - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}' + - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY", cluster="mycluster"}' values: '0 0 1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}' + - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -930,20 +893,21 @@ tests: - exp_labels: name: MDS_INSUFFICIENT_STANDBY severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby - summary: Ceph filesystem standby daemons too few + summary: Ceph filesystem standby daemons too few on cluster mycluster description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons." - interval: 1m input_series: - - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}' + - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS", cluster="mycluster"}' values: '0 0 1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}' + - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -954,21 +918,22 @@ tests: - exp_labels: name: FS_WITH_FAILED_MDS severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.5.5 exp_annotations: documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds - summary: MDS daemon failed, no further standby available + summary: MDS daemon failed, no further standby available on cluster mycluster description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS." - interval: 1m input_series: - - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}' + - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX", cluster="mycluster"}' values: '0 0 1 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}' + - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -979,10 +944,11 @@ tests: - exp_labels: name: MDS_UP_LESS_THAN_MAX severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max - summary: Ceph MDS daemon count is lower than configured + summary: Ceph MDS daemon count is lower than configured on cluster mycluster description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value." # MGR - interval: 1m @@ -1012,13 +978,13 @@ tests: description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. 
Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'." - interval: 1m input_series: - - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}' + - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH", cluster="mycluster"}' values: '0+0x2 1+0x20' promql_expr_test: - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}' + - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1029,24 +995,26 @@ tests: - exp_labels: name: RECENT_MGR_MODULE_CRASH severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.6.1 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash - summary: A manager module has recently crashed + summary: A manager module has recently crashed on cluster mycluster description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure." + # MON - interval: 1m input_series: - - series: 'ceph_health_detail{name="MON_DISK_CRIT"}' + - series: 'ceph_health_detail{name="MON_DISK_CRIT", cluster="mycluster"}' values: '0+0x2 1+0x10' - - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a", cluster="mycluster"}' values: '1+0x13' promql_expr_test: - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}' + - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1057,23 +1025,24 @@ tests: - exp_labels: name: "MON_DISK_CRIT" severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.3.2 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit - summary: Filesystem space on at least one monitor is critically low + summary: Filesystem space on at least one monitor is critically low on cluster mycluster description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits.
Your monitor hosts are; - ceph-mon-a" - interval: 1m input_series: - - series: 'ceph_health_detail{name="MON_DISK_LOW"}' + - series: 'ceph_health_detail{name="MON_DISK_LOW", cluster="mycluster"}' values: '0+0x2 1+0x10' - - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a", cluster="mycluster"}' values: '1+0x13' promql_expr_test: - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}' + - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1084,20 +1053,21 @@ tests: - exp_labels: name: "MON_DISK_LOW" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low - summary: Drive space on at least one monitor is approaching full + summary: Drive space on at least one monitor is approaching full on cluster mycluster description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a" - interval: 1m input_series: - - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}' + - series: 'ceph_health_detail{name="MON_CLOCK_SKEW", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}' + - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1108,34 +1078,35 @@ tests: - exp_labels: name: "MON_CLOCK_SKEW" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew - summary: Clock skew detected among monitors + summary: Clock skew detected among monitors on cluster mycluster description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon." 
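+ # A rough walkthrough of the quorum arithmetic exercised by the next two tests
+ # (a reading of the expressions used in those tests, not additional test input):
+ # with 3 monitors the majority is floor(3/2) + 1 = 2, so when mon.c drops out of
+ # quorum exactly the bare majority remains and
+ # count(ceph_mon_quorum_status == 1) by (cluster) == bool (floor(count(ceph_mon_metadata) by (cluster) / 2) + 1)
+ # evaluates to 1, letting the critical MON_DOWN case fire; with 5 monitors the
+ # majority is floor(5/2) + 1 = 3, so a single down monitor (mon.e) still leaves
+ # quorum intact and only the warning-level case matches.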
# Check 3 mons one down, quorum at risk - interval: 1m input_series: - - series: 'ceph_health_detail{name="MON_DOWN"}' + - series: 'ceph_health_detail{name="MON_DOWN", cluster="mycluster"}' values: '0+0x2 1+0x12' - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c", cluster="mycluster"}' values: '1+0x2 0+0x12' - - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3", cluster="mycluster"}' values: '1+0x14' promql_expr_test: - - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1 + - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) (count(ceph_mon_quorum_status == 1) by (cluster) == bool (floor(count(ceph_mon_metadata) by (cluster) / 2) + 1))) == 1 eval_time: 3m exp_samples: - - labels: '{}' + - labels: '{cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1146,40 +1117,41 @@ tests: exp_alerts: - exp_labels: severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.3.1 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down - summary: Monitor quorum is at risk + summary: Monitor quorum is at risk on cluster mycluster description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. 
The following monitors are down: - mon.c on ceph-mon-3" # check 5 mons, 1 down - warning only - interval: 1m input_series: - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e", cluster="mycluster"}' values: '1+0x2 0+0x12' - - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4", cluster="mycluster"}' values: '1+0x14' - - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}' + - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5", cluster="mycluster"}' values: '1+0x14' promql_expr_test: - - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)) + - expr: (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1))) eval_time: 3m exp_samples: - - labels: '{}' + - labels: '{cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1189,21 +1161,23 @@ tests: exp_alerts: - exp_labels: severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down - summary: One or more monitors down - description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n" + summary: One or more monitors down on cluster mycluster + description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. 
The following monitors are down: - mon.e on ceph-mon-5" + # Device Health - interval: 1m input_series: - - series: 'ceph_health_detail{name="DEVICE_HEALTH"}' + - series: 'ceph_health_detail{name="DEVICE_HEALTH", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}' + - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1214,20 +1188,21 @@ tests: - exp_labels: name: "DEVICE_HEALTH" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2 - summary: Device(s) predicted to fail soon + summary: Device(s) predicted to fail soon on cluster mycluster description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD." - interval: 1m input_series: - - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}' + - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}' + - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1238,21 +1213,22 @@ tests: - exp_labels: name: "DEVICE_HEALTH_TOOMANY" severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.4.7 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany - summary: Too many devices are predicted to fail, unable to resolve + summary: Too many devices are predicted to fail on cluster mycluster, unable to resolve description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated." 
- interval: 1m input_series: - - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}' + - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}' + - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1263,25 +1239,27 @@ tests: - exp_labels: name: "DEVICE_HEALTH_IN_USE" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use - summary: Device failure is predicted, but unable to relocate data + summary: Device failure is predicted, but unable to relocate data on cluster mycluster description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer." + # OSD - interval: 1m input_series: - - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}' + - series: 'ceph_health_detail{name="OSD_HOST_DOWN", cluster="mycluster"}' values: '0+0x2 1+0x10' - - series: 'ceph_osd_up{ceph_daemon="osd.0"}' + - series: 'ceph_osd_up{ceph_daemon="osd.0", cluster="mycluster"}' values: '1+0x2 0+0x10' - - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1", cluster="mycluster"}' values: '1+0x12' promql_expr_test: - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}' + - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1292,20 +1270,21 @@ tests: - exp_labels: name: "OSD_HOST_DOWN" severity: warning + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.4.8 exp_annotations: - summary: An OSD host is offline + summary: An OSD host is offline on cluster mycluster description: "The following OSDs are down: - ceph-osd-1 : osd.0" - interval: 1m input_series: - - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}' + - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT", cluster="mycluster"}' values: '0+0x2 1+0x20' promql_expr_test: - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0 eval_time: 1m exp_samples: - - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}' + - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT", cluster="mycluster"}' value: 0 alert_rule_test: - eval_time: 1m @@ -1316,19 +1295,20 @@ tests: - exp_labels: name: "OSD_SLOW_PING_TIME_FRONT" severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: Network issues delaying OSD heartbeats (public network) + summary: Network issues delaying OSD heartbeats (public network) on cluster mycluster description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs." 
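+ # Note on the test pattern used here and in several later cases (a sketch of the
+ # intent, assuming standard promtool unit-test semantics): with values '0+0x2 1+0x20'
+ # at a 1m interval the health flag is still 0 at eval_time 1m, so the "== 0"
+ # promql_expr_test pins down the quiescent state, while the alert assertions that
+ # expect a firing alert presumably target a later eval_time, after the flag has
+ # flipped and the rule's "for" hold time has elapsed.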
- interval: 1m input_series: - - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}' + - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK", cluster="mycluster"}' values: '0+0x2 1+0x20' promql_expr_test: - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0 eval_time: 1m exp_samples: - - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}' + - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK", cluster="mycluster"}' value: 0 alert_rule_test: - eval_time: 1m @@ -1339,19 +1319,20 @@ tests: - exp_labels: name: "OSD_SLOW_PING_TIME_BACK" severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: Network issues delaying OSD heartbeats (cluster network) + summary: Network issues delaying OSD heartbeats (cluster network) on cluster mycluster description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs." - interval: 1m input_series: - - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}' + - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH", cluster="mycluster"}' values: '0+0x2 1+0x20' promql_expr_test: - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0 eval_time: 1m exp_samples: - - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}' + - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH", cluster="mycluster"}' value: 0 alert_rule_test: - eval_time: 1m @@ -1362,20 +1343,21 @@ tests: - exp_labels: name: "BLUESTORE_DISK_SIZE_MISMATCH" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch - summary: OSD size inconsistency error + summary: OSD size inconsistency error on cluster mycluster description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs." - interval: 30s input_series: - - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}' + - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS", cluster="mycluster"}' values: '0+0x2 1+0x20' promql_expr_test: - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}' + - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1386,32 +1368,33 @@ tests: - exp_labels: name: "BLUESTORE_SPURIOUS_READ_ERRORS" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors - summary: Device read errors detected + summary: Device read errors detected on cluster mycluster description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel." 
- interval: 1m input_series: - - series: 'ceph_health_detail{name="OSD_DOWN"}' + - series: 'ceph_health_detail{name="OSD_DOWN", cluster="mycluster"}' values: '0+0x2 1+0x10' - - series: 'ceph_osd_up{ceph_daemon="osd.0"}' + - series: 'ceph_osd_up{ceph_daemon="osd.0", cluster="mycluster"}' values: '1+0x12' - - series: 'ceph_osd_up{ceph_daemon="osd.1"}' + - series: 'ceph_osd_up{ceph_daemon="osd.1", cluster="mycluster"}' values: '1+0x2 0+0x10' - - series: 'ceph_osd_up{ceph_daemon="osd.2"}' + - series: 'ceph_osd_up{ceph_daemon="osd.2", cluster="mycluster"}' values: '1+0x12' - - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1", cluster="mycluster"}' values: '1+0x12' - - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2", cluster="mycluster"}' values: '1+0x12' - - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3", cluster="mycluster"}' values: '1+0x12' promql_expr_test: - expr: ceph_health_detail{name="OSD_DOWN"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}' + - labels: '{__name__="ceph_health_detail", name="OSD_DOWN", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1422,21 +1405,22 @@ tests: - exp_labels: name: "OSD_DOWN" severity: warning + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.4.2 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down - summary: An OSD has been marked down - description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n" + summary: An OSD has been marked down on cluster mycluster + description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2" - interval: 1m input_series: - - series: 'ceph_health_detail{name="OSD_NEARFULL"}' + - series: 'ceph_health_detail{name="OSD_NEARFULL", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}' + - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1447,21 +1431,22 @@ tests: - exp_labels: name: "OSD_NEARFULL" severity: warning + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.4.3 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull - summary: OSD(s) running low on free space (NEARFULL) + summary: OSD(s) running low on free space (NEARFULL) on cluster mycluster description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. 
- interval: 1m input_series: - - series: 'ceph_health_detail{name="OSD_FULL"}' + - series: 'ceph_health_detail{name="OSD_FULL", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="OSD_FULL"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}' + - labels: '{__name__="ceph_health_detail", name="OSD_FULL", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1472,21 +1457,22 @@ tests: - exp_labels: name: "OSD_FULL" severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.4.6 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full - summary: OSD full, writes blocked + summary: OSD full, writes blocked on cluster mycluster description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. - interval: 1m input_series: - - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}' + - series: 'ceph_health_detail{name="OSD_BACKFILLFULL", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}' + - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1497,20 +1483,21 @@ tests: - exp_labels: name: "OSD_BACKFILLFULL" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull - summary: OSD(s) too full for backfill operations + summary: OSD(s) too full for backfill operations on cluster mycluster description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." - interval: 30s input_series: - - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}' + - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS", cluster="mycluster"}' values: '0+0x2 1+0x20' promql_expr_test: - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0 eval_time: 1m exp_samples: - - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}' + - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS", cluster="mycluster"}' value: 0 alert_rule_test: - eval_time: 1m @@ -1521,38 +1508,40 @@ tests: - exp_labels: name: "OSD_TOO_MANY_REPAIRS" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs - summary: OSD reports a high number of read errors + summary: OSD reports a high number of read errors on cluster mycluster description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive. 
+ # Pools # trigger percent full prediction on pools 1 and 2 only - interval: 12h input_series: - - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}' + - series: 'ceph_pool_percent_used{pool_id="1", instance="9090", cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}' + - series: 'ceph_pool_percent_used{pool_id="1", instance="8090", cluster="mycluster"}' values: '78 89 79 98 78' - - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}' + - series: 'ceph_pool_percent_used{pool_id="2", instance="9090", cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}' + - series: 'ceph_pool_percent_used{pool_id="2", instance="8090", cluster="mycluster"}' values: '22 22 23 23 24' - - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}' + - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated", cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}' + - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated", cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}' + - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated", cluster="mycluster"}' values: '1 1 1 1 1' - - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}' + - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated", cluster="mycluster"}' values: '1 1 1 1 1' promql_expr_test: - expr: | - (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) + (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(cluster, pool_id, instance) group_right() ceph_pool_metadata) >= 95 eval_time: 36h exp_samples: - - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}' + - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated", cluster="mycluster"}' value: 1.435E+02 # 142% alert_rule_test: - eval_time: 48h @@ -1563,20 +1552,21 @@ tests: name: default.rgw.index pool_id: 1 severity: warning + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.9.2 exp_annotations: - summary: Pool growth rate may soon exceed capacity + summary: Pool growth rate may soon exceed capacity on cluster mycluster description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours. 
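+ # A rough reading of the capacity prediction above (derived from the series values,
+ # with the 12h "interval" as the sample spacing): pool_id="1" on instance "8090"
+ # trends upward from 78 toward 98 percent across the 2d lookback, so
+ # predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) extrapolates to roughly
+ # 143 five days out, clearing the >= 95 threshold and matching the expected sample
+ # 1.435E+02, while the flat pools never produce a sample.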
- interval: 1m input_series: - - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}' + - series: 'ceph_health_detail{name="POOL_BACKFILLFULL", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}' + - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1587,20 +1577,21 @@ tests: - exp_labels: name: "POOL_BACKFILLFULL" severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: Free space in a pool is too low for recovery/backfill + summary: Free space in a pool is too low for recovery/backfill on cluster mycluster description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity. - interval: 1m input_series: - - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}' + - series: 'ceph_health_detail{name="POOL_NEAR_FULL", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}' + - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1611,21 +1602,22 @@ tests: - exp_labels: name: "POOL_NEAR_FULL" severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: One or more Ceph pools are nearly full + summary: One or more Ceph pools are nearly full on cluster mycluster description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active." # PGs - interval: 1m input_series: - - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}' + - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED",cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}' + - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1636,20 +1628,21 @@ tests: - exp_labels: name: "PG_NOT_SCRUBBED" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed - summary: Placement group(s) have not been scrubbed + summary: Placement group(s) have not been scrubbed on cluster mycluster description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. 
You can manually initiate a scrub with: ceph pg scrub <pgid>" - interval: 1m input_series: - - series: 'ceph_health_detail{name="PG_DAMAGED"}' + - series: 'ceph_health_detail{name="PG_DAMAGED",cluster="mycluster"}' values: '0+0x4 1+0x20' promql_expr_test: - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1 eval_time: 5m exp_samples: - - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}' + - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1660,21 +1653,22 @@ tests: - exp_labels: name: "PG_DAMAGED" severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.7.4 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged - summary: Placement group damaged, manual intervention needed + summary: Placement group damaged, manual intervention needed on cluster mycluster description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command. - interval: 1m input_series: - - series: 'ceph_health_detail{name="TOO_MANY_PGS"}' + - series: 'ceph_health_detail{name="TOO_MANY_PGS",cluster="mycluster"}' values: '0+0x4 1+0x20' promql_expr_test: - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1 eval_time: 5m exp_samples: - - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}' + - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1685,20 +1679,21 @@ tests: - exp_labels: name: "TOO_MANY_PGS" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs - summary: Placement groups per OSD is too high + summary: Placement groups per OSD is too high on cluster mycluster description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools." 
- interval: 1m input_series: - - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}' + - series: 'ceph_health_detail{name="PG_RECOVERY_FULL", cluster="mycluster"}' values: '0+0x2 1+0x20' promql_expr_test: - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0 eval_time: 1m exp_samples: - - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}' + - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL", cluster="mycluster"}' value: 0 alert_rule_test: - eval_time: 1m @@ -1709,21 +1704,22 @@ tests: - exp_labels: name: "PG_RECOVERY_FULL" severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.7.5 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full - summary: OSDs are too full for recovery + summary: OSDs are too full for recovery on cluster mycluster description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data. - interval: 1m input_series: - - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}' + - series: 'ceph_health_detail{name="PG_BACKFILL_FULL", cluster="mycluster"}' values: '0+0x2 1+0x20' promql_expr_test: - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0 eval_time: 1m exp_samples: - - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}' + - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL", cluster="mycluster"}' value: 0 alert_rule_test: - eval_time: 1m @@ -1734,17 +1730,18 @@ tests: - exp_labels: name: "PG_BACKFILL_FULL" severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.7.6 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full - summary: Backfill operations are blocked due to lack of free space + summary: Backfill operations are blocked due to lack of free space on cluster mycluster description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data. - interval: 1m input_series: - - series: 'ceph_health_detail{name="PG_AVAILABILITY"}' + - series: 'ceph_health_detail{name="PG_AVAILABILITY", cluster="mycluster"}' values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_health_detail{name="OSD_DOWN"}' + - series: 'ceph_health_detail{name="OSD_DOWN", cluster="mycluster"}' values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0' promql_expr_test: - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) @@ -1767,21 +1764,22 @@ tests: - exp_labels: name: "PG_AVAILABILITY" severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.7.3 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability - summary: PG is unavailable, blocking I/O + summary: PG is unavailable on cluster mycluster, blocking I/O description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O. 
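+ # The PG_AVAILABILITY expression above subtracts scalar(ceph_health_detail{name="OSD_DOWN"}),
+ # so while the unavailability is already explained by a down OSD the two flags cancel
+ # out (1 - 1 = 0); in the series above OSD_DOWN clears at the 9-minute mark while
+ # PG_AVAILABILITY stays raised, and only from that point does the subtraction leave a
+ # positive value for the alert to act on.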
- interval: 1m input_series: - - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}' + - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED", cluster="mycluster"}' values: '0+0x2 1+0x10' promql_expr_test: - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1 eval_time: 3m exp_samples: - - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}' + - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -1792,10 +1790,11 @@ tests: - exp_labels: name: "PG_NOT_DEEP_SCRUBBED" severity: warning + cluster: mycluster type: ceph_default exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed - summary: Placement group(s) have not been deep scrubbed + summary: Placement group(s) have not been deep scrubbed on cluster mycluster description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window. # Prometheus @@ -1821,25 +1820,26 @@ tests: exp_annotations: summary: The scrape job for Ceph is missing from Prometheus description: The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance. + # RADOS - interval: 1m input_series: - - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}' + - series: 'ceph_health_detail{name="OBJECT_UNFOUND", cluster="mycluster"}' values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_up{ceph_daemon="osd.0"}' + - series: 'ceph_osd_up{ceph_daemon="osd.0", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_up{ceph_daemon="osd.1"}' + - series: 'ceph_osd_up{ceph_daemon="osd.1", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_up{ceph_daemon="osd.2"}' + - series: 'ceph_osd_up{ceph_daemon="osd.2", cluster="mycluster"}' values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.0", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.1", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}' + - series: 'ceph_osd_metadata{ceph_daemon="osd.2", cluster="mycluster"}' values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' promql_expr_test: - - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1 + - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() group_right (cluster) (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by (cluster)) == 1 eval_time: 1m exp_samples: alert_rule_test: @@ -1853,16 +1853,18 @@ tests: exp_alerts: - exp_labels: severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.10.1 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound - summary: Object(s) marked UNFOUND + summary: Object(s) marked UNFOUND on cluster mycluster 
description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified. + # Generic Alerts - interval: 1m input_series: - - series: 'ceph_health_detail{name="RECENT_CRASH"}' + - series: 'ceph_health_detail{name="RECENT_CRASH", cluster="mycluster"}' values: '0 0 0 1 1 1 1 1 1 1 1' promql_expr_test: - expr: ceph_health_detail{name="RECENT_CRASH"} == 1 @@ -1880,11 +1882,12 @@ tests: - exp_labels: name: RECENT_CRASH severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.1.2 exp_annotations: documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash - summary: One or more Ceph daemons have crashed, and are pending acknowledgement + summary: One or more Ceph daemons have crashed, and are pending acknowledgement on cluster mycluster description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command. # new rbdmirror alerts tests @@ -1892,21 +1895,21 @@ tests: # alert: CephRBDMirrorImagesPerDaemonHigh - interval: 1m input_series: - - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data", cluster="mycluster"}' values: '0+0x20 1+1x130' - - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image2", namespace="default", pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_snapshots{ceph_daemon="client.admin.40628", image="image2", namespace="default", pool="data", cluster="mycluster"}' values: '1+1x130 131+0x20' # prometheus query test promql_expr_test: # negative test where there are no samples - - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100 + - expr: sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100 eval_time: 50m exp_samples: # second positive test - - expr: sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100 + - expr: sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100 eval_time: 70m exp_samples: - - labels: '{ceph_daemon="client.admin.40628", namespace="default"}' + - labels: '{ceph_daemon="client.admin.40628", namespace="default", cluster="mycluster"}' value: 121 # prometheus alert test alert_rule_test: @@ -1921,31 +1924,32 @@ tests: - exp_labels: oid: "1.3.6.1.4.1.50495.1.2.1.10.2" severity: "critical" + cluster: mycluster type: "ceph_default" ceph_daemon: "client.admin.40628" namespace: "default" exp_annotations: - description: "Number of image replications per daemon is not suppossed to go beyond threshold 100" - summary: "Number of image replications are now above 100" + summary: "Number of image replications are now above 100 on cluster mycluster" + description: "Number of image replications per daemon is not supposed to go beyond threshold 100" # alert: CephRBDMirrorImagesNotInSync - interval: 1m input_series: - - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + - series: 
'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data",cluster="mycluster"}' values: '1.678+0x20 2.03+0x20 3.21+0x20' - - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data",cluster="mycluster"}' values: '1.678+0x20 2.03+0x20 2.03+0x20' # prometheus query test promql_expr_test: # negative test where there are no samples - - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0 + - expr: sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0 eval_time: 30m exp_samples: # second positive test - - expr: sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0 + - expr: sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0 eval_time: 45m exp_samples: - - labels: '{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data"}' + - labels: '{ceph_daemon="client.admin.40628", image="image1", namespace="default", pool="data", cluster="mycluster"}' value: 1.1800000000000002 # prometheus alert test alert_rule_test: @@ -1962,48 +1966,49 @@ tests: pool: "data" oid: "1.3.6.1.4.1.50495.1.2.1.10.3" severity: "critical" + cluster: mycluster type: "ceph_default" ceph_daemon: "client.admin.40628" namespace: "default" exp_annotations: + summary: "Some of the RBD mirror images are not in sync with the remote counter parts on cluster mycluster" description: "Both local and remote RBD mirror images should be in sync." - summary: "Some of the RBD mirror images are not in sync with the remote counter parts." 
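+ # A quick arithmetic check of the expected sample above: at eval_time 45m the local
+ # timestamp for image1 has moved into its '3.21' segment while the remote timestamp
+ # is still in the '2.03' segment, and topk by (cluster, ceph_daemon, image, namespace,
+ # pool) (1, ...) just keeps the single series per label set before the subtraction,
+ # so the difference is 3.21 - 2.03 = 1.18 (reported as 1.1800000000000002 due to
+ # floating-point representation).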
# alert: CephRBDMirrorImagesNotInSyncVeryHigh - interval: 1m input_series: - - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data",cluster="mycluster"}' values: '1.678+0x20 2.03+0x20 3.21+0x20' - - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image1",namespace="default",pool="data",cluster="mycluster"}' values: '1.678+0x20 2.03+0x20 2.03+0x20' - - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data",cluster="mycluster"}' values: '2.189+0x20 3.301+0x14 3.301+0x26' - - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image2",namespace="default",pool="data",cluster="mycluster"}' values: '2.189+0x20 3.301+0x14 7.13+0x26' - - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data",cluster="mycluster"}' values: '2.189+0x20 3.301+0x14 3.301+0x26' - - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image3",namespace="default",pool="data",cluster="mycluster"}' values: '2.189+0x20 3.301+0x14 7.13+0x26' - - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_local_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data",cluster="mycluster"}' values: '2.189+0x65' - - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data"}' + - series: 'ceph_rbd_mirror_snapshot_image_remote_timestamp{ceph_daemon="client.admin.40628",image="image4",namespace="default",pool="data",cluster="mycluster"}' values: '2.189+0x65' - - series: 'ceph_rbd_mirror_snapshot_snapshots{ceph_daemon="client.admin.40628"}' + - series: 'ceph_rbd_mirror_snapshot_snapshots{ceph_daemon="client.admin.40628",cluster="mycluster"}' values: '1+0x20 2+0x45' # prometheus query test promql_expr_test: # test each query individually # query 1 - - expr: count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) + - expr: count by (ceph_daemon, cluster) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, 
ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) eval_time: 45m exp_samples: - - labels: '{ceph_daemon="client.admin.40628"}' + - labels: '{ceph_daemon="client.admin.40628",cluster="mycluster"}' value: 3 # query 2 - - expr: sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots) * .1 + - expr: sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots) * .1 eval_time: 45m exp_samples: - - labels: '{ceph_daemon="client.admin.40628"}' + - labels: '{ceph_daemon="client.admin.40628",cluster="mycluster"}' value: 0.2 # prometheus alert test alert_rule_test: @@ -2019,15 +2024,16 @@ tests: ceph_daemon: "client.admin.40628" oid: "1.3.6.1.4.1.50495.1.2.1.10.4" severity: "critical" + cluster: mycluster type: "ceph_default" exp_annotations: - description: "More than 10% of the images have synchronization problems" - summary: "Number of unsynchronized images are very high." + summary: "Number of unsynchronized images are very high on cluster mycluster" + description: "More than 10% of the images have synchronization problems." # alert: "CephRBDMirrorImageTransferBandwidthHigh" - interval: 1m input_series: - - series: 'ceph_rbd_mirror_journal_replay_bytes{ceph_daemon="client.admin.40628"}' + - series: 'ceph_rbd_mirror_journal_replay_bytes{ceph_daemon="client.admin.40628", cluster="mycluster"}' values: '0+0x10 1+0x5 10+30x25 736+200x30' # prometheus query test promql_expr_test: @@ -2036,25 +2042,25 @@ tests: - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) eval_time: 5m exp_samples: - - labels: '{ceph_daemon="client.admin.40628"}' + - labels: '{ceph_daemon="client.admin.40628", cluster="mycluster"}' value: 0.0 # rate 2 - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) eval_time: 20m exp_samples: - - labels: '{ceph_daemon="client.admin.40628"}' + - labels: '{ceph_daemon="client.admin.40628", cluster="mycluster"}' value: 0.33 # rate 3 - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) eval_time: 40m exp_samples: - - labels: '{ceph_daemon="client.admin.40628"}' + - labels: '{ceph_daemon="client.admin.40628", cluster="mycluster"}' value: 0.5 # rate 4 - expr: rate(ceph_rbd_mirror_journal_replay_bytes[5m]) eval_time: 50m exp_samples: - - labels: '{ceph_daemon="client.admin.40628"}' + - labels: '{ceph_daemon="client.admin.40628", cluster="mycluster"}' value: 3.3333333333333335 # prometheus alert test alert_rule_test: @@ -2070,20 +2076,21 @@ tests: ceph_daemon: "client.admin.40628" oid: "1.3.6.1.4.1.50495.1.2.1.10.5" severity: "warning" + cluster: mycluster type: "ceph_default" exp_annotations: + summary: "The replication network usage on cluster mycluster has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes" description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously" - summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. 
This alert will be cleaned automatically after 30 minutes" - interval: 30s input_series: - - series: 'ceph_health_detail{name="HARDWARE_STORAGE"}' + - series: 'ceph_health_detail{name="HARDWARE_STORAGE", cluster="mycluster"}' values: '1+0x40' promql_expr_test: - expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE"}' + - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -2094,20 +2101,21 @@ tests: - exp_labels: name: HARDWARE_STORAGE severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.13.1 exp_annotations: - summary: Storage devices error(s) detected + summary: Storage devices error(s) detected on cluster mycluster description: "Some storage devices are in error. Check `ceph health detail`." - interval: 30s input_series: - - series: 'ceph_health_detail{name="HARDWARE_MEMORY"}' + - series: 'ceph_health_detail{name="HARDWARE_MEMORY", cluster="mycluster"}' values: '1+0x40' promql_expr_test: - expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY"}' + - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -2118,20 +2126,21 @@ tests: - exp_labels: name: HARDWARE_MEMORY severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.13.2 exp_annotations: - summary: DIMM error(s) detected + summary: DIMM error(s) detected on cluster mycluster description: "DIMM error(s) detected. Check `ceph health detail`." - interval: 30s input_series: - - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR"}' + - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR", cluster="mycluster"}' values: '1+0x40' promql_expr_test: - expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR"}' + - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -2142,20 +2151,21 @@ tests: - exp_labels: name: HARDWARE_PROCESSOR severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.13.3 exp_annotations: - summary: Processor error(s) detected + summary: Processor error(s) detected on cluster mycluster description: "Processor error(s) detected. Check `ceph health detail`." - interval: 30s input_series: - - series: 'ceph_health_detail{name="HARDWARE_NETWORK"}' + - series: 'ceph_health_detail{name="HARDWARE_NETWORK", cluster="mycluster"}' values: '1+0x40' promql_expr_test: - expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK"}' + - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -2166,20 +2176,21 @@ tests: - exp_labels: name: HARDWARE_NETWORK severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.13.4 exp_annotations: - summary: Network error(s) detected + summary: Network error(s) detected on cluster mycluster description: "Network error(s) detected. Check `ceph health detail`." 
- interval: 30s input_series: - - series: 'ceph_health_detail{name="HARDWARE_POWER"}' + - series: 'ceph_health_detail{name="HARDWARE_POWER", cluster="mycluster"}' values: '1+0x40' promql_expr_test: - expr: ceph_health_detail{name="HARDWARE_POWER"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER"}' + - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -2190,20 +2201,21 @@ tests: - exp_labels: name: HARDWARE_POWER severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.13.5 exp_annotations: - summary: Power supply error(s) detected + summary: Power supply error(s) detected on cluster mycluster description: "Power supply error(s) detected. Check `ceph health detail`." - interval: 30s input_series: - - series: 'ceph_health_detail{name="HARDWARE_FANS"}' + - series: 'ceph_health_detail{name="HARDWARE_FANS", cluster="mycluster"}' values: '1+0x40' promql_expr_test: - expr: ceph_health_detail{name="HARDWARE_FANS"} > 0 eval_time: 2m exp_samples: - - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS"}' + - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 1m @@ -2214,35 +2226,36 @@ tests: - exp_labels: name: HARDWARE_FANS severity: critical + cluster: mycluster type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.13.6 exp_annotations: - summary: Fan error(s) detected + summary: Fan error(s) detected on cluster mycluster description: "Fan error(s) detected. Check `ceph health detail`." # nvmeof Tests # NVMeoFSubsystemNamespaceLimit - interval: 1m input_series: - - series: 'ceph_nvmeof_subsystem_namespace_limit{nqn="wah"}' + - series: 'ceph_nvmeof_subsystem_namespace_limit{nqn="wah", cluster="mycluster"}' values: '5x10' - - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk1"}' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk1", cluster="mycluster"}' + values: '1x10' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk2", cluster="mycluster"}' values: '1x10' - - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk2"}' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk3", cluster="mycluster"}' values: '1x10' - - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk3"}' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk4", cluster="mycluster"}' values: '1x10' - - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk4"}' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk5", cluster="mycluster"}' values: '1x10' - - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk5"}' + - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk6", cluster="mycluster"}' values: '1x10' - - series: 'ceph_nvmeof_subsystem_namespace_metadata{nqn="wah", bdev_name="disk6"}' - values: '1x10' promql_expr_test: - - expr: (count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit + - expr: (count by(nqn, cluster) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit eval_time: 1m exp_samples: - - labels: '{nqn="wah"}' + - labels: '{nqn="wah",cluster="mycluster"}' value: 6 alert_rule_test: - eval_time: 5m @@ -2251,29 +2264,30 @@ tests: - 
exp_labels: nqn: wah severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: "wah subsystem has reached its maximum number of namespaces " + summary: "wah subsystem has reached its maximum number of namespaces on cluster mycluster" description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to wah" # NVMeoFTooManyGateways - interval: 1m input_series: - - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1"}' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1",cluster="mycluster"}' values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2"}' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2",cluster="mycluster"}' values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3"}' - values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4"}' - values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5"}' - values: '1+0x20' - promql_expr_test: - - expr: count(ceph_nvmeof_gateway_info) > 4.00 + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5",cluster="mycluster"}' + values: '1+0x20' + promql_expr_test: + - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00 eval_time: 1m exp_samples: - - labels: '{}' + - labels: '{cluster="mycluster"}' value: 5 alert_rule_test: - eval_time: 5m @@ -2281,30 +2295,35 @@ tests: exp_alerts: - exp_labels: severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: "Max supported gateways exceeded " + summary: "Max supported gateways exceeded on cluster mycluster" description: "You may create many gateways, but 4 is the tested limit" # NVMeoFMaxGatewayGroupSize - interval: 1m input_series: - - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.1"}' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.1",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.3",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.9",cluster="mycluster"}' values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2"}' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.12",cluster="mycluster"}' values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.3"}' - values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4"}' - values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5"}' - values: '1+0x20' - promql_expr_test: - - expr: count by(group) (ceph_nvmeof_gateway_info) > 2.00 + - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}' + values: '1+0x20' + promql_expr_test: + - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 4.00 eval_time: 1m exp_samples: - - labels: '{group="group-1"}' - value: 3 + - labels: '{cluster="mycluster",group="group-1"}' + value: 5 alert_rule_test: - eval_time: 5m alertname: NVMeoFMaxGatewayGroupSize @@ -2312,25 +2331,26 @@ tests: - exp_labels: group: group-1 severity: warning + cluster: mycluster type: 
ceph_default exp_annotations: - summary: "Max gateways within a gateway group (group-1) exceeded " - description: "You may create many gateways in a gateway group, but 2 is the tested limit" + summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster" + description: "You may create many gateways in a gateway group, but 4 is the tested limit" # NVMeoFSingleGatewayGroup - interval: 1m input_series: - - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2"}' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.2",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}' values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4"}' - values: '1+0x20' - - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5"}' - values: '1+0x20' promql_expr_test: - - expr: count by(group) (ceph_nvmeof_gateway_info) == 1 + - expr: count by(group, cluster) (ceph_nvmeof_gateway_info) == 1 eval_time: 1m exp_samples: - - labels: '{group="group-1"}' + - labels: '{group="group-1", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 5m @@ -2339,21 +2359,22 @@ tests: - exp_labels: group: group-1 severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: "The gateway group group-1 consists of a single gateway - HA is not possible " - description: "Although a single member gateway group is valid, it should only be used for test purposes" + summary: "The gateway group group-1 consists of a single gateway - HA is not possible on cluster mycluster" + description: "Although a single member gateway group is valid, it should only be used for test purposes" # NVMeoFHighGatewayCPU - interval: 1m input_series: - - series: 'ceph_nvmeof_reactor_seconds_total{mode="busy",name="nvmf_tgt_poll_group_0",instance="node-1:10008"}' + - series: 'ceph_nvmeof_reactor_seconds_total{mode="busy",name="nvmf_tgt_poll_group_0",instance="node-1:10008",cluster="mycluster"}' values: '880+5080x20' promql_expr_test: - - expr: label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > 80 + - expr: label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > 80 eval_time: 5m exp_samples: - - labels: '{instance="node-1"}' + - labels: '{instance="node-1", cluster="mycluster"}' value: 8.466666666666667E+01 alert_rule_test: - eval_time: 15m @@ -2362,23 +2383,24 @@ tests: - exp_labels: instance: node-1 severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: "CPU used by node-1 NVMe-oF Gateway is high " - description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores" - + summary: "CPU used by node-1 NVMe-oF Gateway is high on cluster mycluster" + description: "Typically, high CPU may indicate degraded performance. 
Consider increasing the number of reactor cores" + # NVMeoFGatewayOpenSecurity - interval: 1m input_series: - - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.good", allow_any_host="no"}' + - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.good", allow_any_host="no", cluster="mycluster"}' values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.bad", allow_any_host="yes"}' + - series: 'ceph_nvmeof_subsystem_metadata{nqn="nqn.bad", allow_any_host="yes", cluster="mycluster"}' values: '1+0x10' promql_expr_test: - expr: ceph_nvmeof_subsystem_metadata{allow_any_host="yes"} eval_time: 1m exp_samples: - - labels: '{__name__="ceph_nvmeof_subsystem_metadata",nqn="nqn.bad",allow_any_host="yes"}' + - labels: '{__name__="ceph_nvmeof_subsystem_metadata",nqn="nqn.bad",allow_any_host="yes", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 5m @@ -2388,53 +2410,54 @@ tests: allow_any_host: yes nqn: nqn.bad severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: "Subsystem nqn.bad has been defined without host level security " - description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss" + summary: "Subsystem nqn.bad has been defined without host level security on cluster mycluster" + description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss" # NVMeoFTooManySubsystems - interval: 1m input_series: - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn1"}' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn1",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn2",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn3",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn4",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn5",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn6",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn7",cluster="mycluster"}' values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn2"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn3"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn4"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn5"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn6"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn7"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn8"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn9"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn10"}' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn8",cluster="mycluster"}' values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn11"}' - values: '1+0x10' - - series: 
'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn12"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn13"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn14"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn15"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn16"}' - values: '1+0x10' - - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn17"}' - values: '1+0x10' - promql_expr_test: - - expr: count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > 16 + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn9",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn10",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn11",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn12",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn13",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn14",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn15",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn16",cluster="mycluster"}' + values: '1+0x10' + - series: 'ceph_nvmeof_subsystem_metadata{instance="node-1:10008",nqn="nqn17",cluster="mycluster"}' + values: '1+0x10' + promql_expr_test: + - expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*):.*")) > 16 eval_time: 1m exp_samples: - - labels: '{gateway_host="node-1"}' + - labels: '{gateway_host="node-1", cluster="mycluster"}' value: 17 alert_rule_test: - eval_time: 5m @@ -2443,23 +2466,24 @@ tests: - exp_labels: gateway_host: node-1 severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: "The number of subsystems defined to the gateway exceeds supported values " - description: "Although you may continue to create subsystems in node-1, the configuration may not be supported" + summary: "The number of subsystems defined to the gateway exceeds supported values on cluster mycluster" + description: "Although you may continue to create subsystems in node-1, the configuration may not be supported" # NVMeoFVersionMismatch - interval: 1m input_series: - - series: 'ceph_nvmeof_gateway_info{version="0.0.7"}' + - series: 'ceph_nvmeof_gateway_info{version="0.0.7",cluster="mycluster"}' values: '1+0x80' - - series: 'ceph_nvmeof_gateway_info{version="1.0.0"}' + - series: 'ceph_nvmeof_gateway_info{version="1.0.0",cluster="mycluster"}' values: '1+0x80' promql_expr_test: - - expr: count(count by(version) (ceph_nvmeof_gateway_info)) > 1 + - expr: count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1 eval_time: 1m exp_samples: - - labels: '{}' + - labels: '{cluster="mycluster"}' value: 2 alert_rule_test: - eval_time: 1h @@ -2467,23 +2491,24 @@ tests: exp_alerts: - exp_labels: severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: "The cluster has different NVMe-oF gateway releases 
active " - description: "This may indicate an issue with deployment. Check cephadm logs" + summary: "Too many different NVMe-oF gateway releases active on cluster mycluster" + description: "This may indicate an issue with deployment. Check cephadm logs" # NVMeoFHighClientCount - interval: 1m input_series: - - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1"}' + - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1",cluster="mycluster"}' values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44' - - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2"}' + - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2",cluster="mycluster"}' values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16' promql_expr_test: - expr: ceph_nvmeof_subsystem_host_count > 32.00 eval_time: 15m exp_samples: - - labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1"}' + - labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1",cluster="mycluster"}' value: 38 alert_rule_test: - eval_time: 20m @@ -2492,55 +2517,57 @@ tests: - exp_labels: nqn: nqn1 severity: warning + cluster: mycluster type: ceph_default exp_annotations: - summary: "The number of clients connected to nqn1 is too high " - description: "The supported limit for clients connecting to a subsystem is 32" - + summary: "The number of clients connected to nqn1 is too high on cluster mycluster" + description: "The supported limit for clients connecting to a subsystem is 32" + # NVMeoFHighHostCPU - interval: 1m input_series: - - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="0"}' + - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="0",cluster="mycluster"}' values: '0+18x10 180+9x20' - - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="1"}' + - series: 'node_cpu_seconds_total{mode="idle",instance="node-1:9100",cpu="1",cluster="mycluster"}' values: '0+18x10 180+9x20' - - series: 'ceph_nvmeof_gateway_info{instance="node-1:10008"}' + - series: 'ceph_nvmeof_gateway_info{instance="node-1:10008",cluster="mycluster"}' values: '1.00+0x20' promql_expr_test: - expr: 100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= 80 eval_time: 16m exp_samples: - - labels: '{host="node-1",instance="node-1:10008"}' + - labels: '{host="node-1",instance="node-1:10008",cluster="mycluster"}' value: 85 alert_rule_test: # negative match at 15m - eval_time: 15m alertname: NVMeoFHighHostCPU - # positive match at 25m + # positive match at 25m - eval_time: 25m alertname: NVMeoFHighHostCPU exp_alerts: - exp_labels: instance: node-1:10008 host: node-1 + cluster: mycluster severity: warning type: ceph_default exp_annotations: - summary: "The CPU is high (85%) on NVMeoF Gateway host (node-1) " - description: "High CPU on a gateway host can lead to CPU contention and performance degradation" + summary: "The CPU is high (85%) on NVMeoF Gateway host (node-1) on cluster mycluster" + description: "High CPU on a gateway host can lead to CPU contention and performance degradation" # NVMeoFInterfaceDown - triggered on eth0 only - interval: 30s input_series: - - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down", device="eth0"}' + - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="down", device="eth0", cluster="mycluster"}' + values: '1+0x30' + - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="up", 
device="eth1", cluster="mycluster"}' values: '1+0x30' - - series: 'ceph_nvmeof_subsystem_listener_iface_info{operstate="up", device="eth1"}' - values: '1+0x30' promql_expr_test: - expr: ceph_nvmeof_subsystem_listener_iface_info{operstate="down"} eval_time: 1m exp_samples: - - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth0", operstate="down"}' + - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth0", operstate="down", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 5m @@ -2550,24 +2577,25 @@ tests: oid: 1.3.6.1.4.1.50495.1.2.1.14.1 operstate: down device: eth0 + cluster: mycluster severity: warning type: ceph_default exp_annotations: - summary: "Network interface eth0 is down " - description: "A NIC used by one or more subsystems is in a down state" + summary: "Network interface eth0 is down on cluster mycluster" + description: "A NIC used by one or more subsystems is in a down state" # NVMeoFInterfaceDuplex - triggered on eth1 only - interval: 30s input_series: - - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="full", device="eth0"}' - values: '1+0x30' - - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="half", device="eth1"}' + - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="full", device="eth0", cluster="mycluster"}' + values: '1+0x30' + - series: 'ceph_nvmeof_subsystem_listener_iface_info{duplex="half", device="eth1", cluster="mycluster"}' values: '1+0x30' promql_expr_test: - expr: ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"} eval_time: 30s exp_samples: - - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth1", duplex="half"}' + - labels: '{__name__="ceph_nvmeof_subsystem_listener_iface_info", device="eth1", duplex="half", cluster="mycluster"}' value: 1 alert_rule_test: - eval_time: 5m @@ -2576,18 +2604,19 @@ tests: - exp_labels: duplex: half device: eth1 + cluster: mycluster severity: warning type: ceph_default exp_annotations: - summary: "Network interface eth1 is not running in full duplex mode " - description: "Until this is resolved, performance from the gateway will be degraded" + summary: "Network interface eth1 is not running in full duplex mode on cluster mycluster" + description: "Until this is resolved, performance from the gateway will be degraded" - # NVMeoFHighReadLatency + # NVMeoFHighReadLatency - interval: 30s input_series: - - series: 'ceph_nvmeof_bdev_read_seconds_total{instance="node-1:10008",bdev_name="disk1"}' - values: '0+1680x10 19800+3000x20' - - series: 'ceph_nvmeof_bdev_reads_completed_total{instance="node-1:10008",bdev_name="disk1"}' + - series: 'ceph_nvmeof_bdev_read_seconds_total{instance="node-1:10008",bdev_name="disk1",cluster="mycluster"}' + values: '0+1680x10 19800+3000x20' + - series: 'ceph_nvmeof_bdev_reads_completed_total{instance="node-1:10008",bdev_name="disk1",cluster="mycluster"}' values: '0+286000x10 2980000+120000x20' promql_expr_test: - expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > 0.02 @@ -2610,14 +2639,14 @@ tests: type: ceph_default exp_annotations: summary: "The average read latency over the last 5 mins has reached 10 ms or more on node-1" - description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate" + description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. 
Please investigate" - # NVMeoFHighWriteLatency + # NVMeoFHighWriteLatency - interval: 30s input_series: - - series: 'ceph_nvmeof_bdev_write_seconds_total{instance="node-1:10008",bdev_name="disk1"}' - values: '0+1680x10 19800+3000x20' - - series: 'ceph_nvmeof_bdev_writes_completed_total{instance="node-1:10008",bdev_name="disk1"}' + - series: 'ceph_nvmeof_bdev_write_seconds_total{instance="node-1:10008",bdev_name="disk1",cluster="mycluster"}' + values: '0+1680x10 19800+3000x20' + - series: 'ceph_nvmeof_bdev_writes_completed_total{instance="node-1:10008",bdev_name="disk1",cluster="mycluster"}' values: '0+286000x10 2980000+120000x20' promql_expr_test: - expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[1m]) / rate(ceph_nvmeof_bdev_writes_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > 0.02 @@ -2641,4 +2670,3 @@ tests: exp_annotations: summary: "The average write latency over the last 5 mins has reached 20 ms or more on node-1" description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate" -
\ No newline at end of file diff --git a/monitoring/ceph-mixin/tox.ini b/monitoring/ceph-mixin/tox.ini index 90ac311edba..8d1ec872e7f 100644 --- a/monitoring/ceph-mixin/tox.ini +++ b/monitoring/ceph-mixin/tox.ini @@ -26,6 +26,7 @@ allowlist_externals = jsonnet jsonnetfmt sh + ./lint-jsonnet.sh description = check: Ensure that auto-generated files matches the current version fix: Update generated files from jsonnet file with latest changes diff --git a/qa/README b/qa/README index f9b8988c6f9..a6a95c479bc 100644 --- a/qa/README +++ b/qa/README @@ -83,3 +83,8 @@ supported_distros as distros$ will be run just once: either on centos, rhel or ubuntu, chosen randomly. The teuthology code can be found in https://github.com/ceph/teuthology.git + +Note: The performance suites clone CBT from master here: https://github.com/ceph/cbt.git +CBT will not support cosbench beyond release tag v0.3, therefore no qa suite should use cosbench. +cosbench support has been removed from qa/tasks/cbt.py. + diff --git a/qa/cephfs/begin/3-kernel.yaml b/qa/cephfs/begin/3-kernel.yaml new file mode 100644 index 00000000000..e94a0d87dc8 --- /dev/null +++ b/qa/cephfs/begin/3-kernel.yaml @@ -0,0 +1,23 @@ +# When the --kernel option is given to teuthology-suite, the kernel is set for +# all nodes (also, the kernel is "distro" when the --kernel option is not set). +# We don't generally want to use a custom kernel for all tests, so unset it. +# The k-testing.yaml will set it, if given, for only the client nodes. +# +# Allow overriding this by using a branch ending in "-all". + +teuthology: + postmerge: + - | + local branch = yaml.kernel.branch + if branch and not yaml.kernel.branch:find "-all$" then + log.debug("removing default kernel specification: %s", yaml.kernel) + py_attrgetter(yaml.kernel).pop('branch', nil) + py_attrgetter(yaml.kernel).pop('deb', nil) + py_attrgetter(yaml.kernel).pop('flavor', nil) + py_attrgetter(yaml.kernel).pop('kdb', nil) + py_attrgetter(yaml.kernel).pop('koji', nil) + py_attrgetter(yaml.kernel).pop('koji_task', nil) + py_attrgetter(yaml.kernel).pop('rpm', nil) + py_attrgetter(yaml.kernel).pop('sha1', nil) + py_attrgetter(yaml.kernel).pop('tag', nil) + end diff --git a/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml b/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml index 2ee219125e7..048cd5ce8b9 100644 --- a/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml +++ b/qa/cephfs/mount/kclient/overrides/distro/testing/k-testing.yaml @@ -1,3 +1,12 @@ +teuthology: + premerge: | + log.debug("base kernel %s", base_config.kernel) + local kernel = base_config.kernel + if kernel.branch ~= "distro" then + log.debug("overriding testing kernel with %s", kernel) + yaml_fragment.kernel.client = kernel + end + kernel: client: branch: testing diff --git a/qa/cephfs/overrides/ignorelist_health.yaml b/qa/cephfs/overrides/ignorelist_health.yaml index 678548fe2cc..5ac25a8f790 100644 --- a/qa/cephfs/overrides/ignorelist_health.yaml +++ b/qa/cephfs/overrides/ignorelist_health.yaml @@ -21,3 +21,7 @@ overrides: - overall HEALTH_ - Replacing daemon - deprecated feature inline_data + - BLUESTORE_SLOW_OP_ALERT + - slow operation indications in BlueStore + - experiencing slow operations in BlueStore + - MGR_MODULE_ERROR diff --git a/qa/cephfs/overrides/pg_health.yaml b/qa/cephfs/overrides/pg_health.yaml index 1740134a2e0..07ca62e01fb 100644 --- a/qa/cephfs/overrides/pg_health.yaml +++ b/qa/cephfs/overrides/pg_health.yaml @@ -9,3 +9,5 @@ overrides: - PG_DEGRADED - Reduced data 
availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh index 4eac1106e8d..843e9b9901b 100755 --- a/qa/standalone/scrub/osd-recovery-scrub.sh +++ b/qa/standalone/scrub/osd-recovery-scrub.sh @@ -234,146 +234,6 @@ function wait_background_check() { return $return_code } -# osd_scrub_during_recovery=true make sure scrub happens -# update 26.8.24: the test should be redesigned. The current version is not -# reliable, and playing around with the timeouts and such won't fix the -# design issues. -function TEST_recovery_scrub_2() { - local dir=$1 - local poolname=test - return 0 - - TESTDATA="testdata.$$" - OSDS=8 - PGS=32 - OBJECTS=40 - - setup $dir || return 1 - run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1 - run_mgr $dir x --mgr_stats_period=1 || return 1 - local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0.1 " - ceph_osd_args+="--osd_scrub_backoff_ratio=0 " - ceph_osd_args+="--osd_stats_update_period_not_scrubbing=3 " - ceph_osd_args+="--osd_stats_update_period_scrubbing=2 " - ceph_osd_args+="--mgr_stats_period=1" - for osd in $(seq 0 $(expr $OSDS - 1)) - do - run_osd $dir $osd --osd_scrub_during_recovery=true --osd_recovery_sleep=1 \ - $ceph_osd_args || return 1 - done - - # Create a pool with $PGS pgs - create_pool $poolname $PGS $PGS - wait_for_clean || return 1 - poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }') - - dd if=/dev/urandom of=$TESTDATA bs=1M count=50 - for i in $(seq 1 $OBJECTS) - do - rados -p $poolname put obj${i} $TESTDATA - done - rm -f $TESTDATA - - ceph osd pool set $poolname size 3 - - ceph pg dump pgs - - # note that the following will be needed if the mclock scheduler is specified - ceph tell osd.* config get osd_mclock_override_recovery_settings - - # the '_max_active' is expected to be 0 - ceph tell osd.1 config get osd_recovery_max_active - # both next parameters are expected to be >=3 - ceph tell osd.1 config set osd_recovery_max_active_hdd 6 - ceph tell osd.1 config set osd_recovery_max_active_ssd 6 - ceph tell osd.1 config get osd_recovery_max_active_hdd - ceph tell osd.1 config get osd_recovery_max_active_ssd - - # Wait for recovery to start - count=0 - while(true) - do - #ceph --format json pg dump pgs | jq '.pg_stats | [.[].state]' - ceph pg dump pgs - if test $(ceph --format json pg dump pgs | - jq '.pg_stats | [.[].state]'| grep recovering | wc -l) -ge 2 - then - break - fi - sleep 2 - if test "$count" -eq "10" - then - echo "Not enough recovery started simultaneously" - return 1 - fi - count=$(expr $count + 1) - done - ceph pg dump pgs - - pids="" - recov_scrub_count=0 - for pg in $(seq 0 $(expr $PGS - 1)) - do - run_in_background pids pg_scrub_mod $poolid.$(printf "%x" $pg) - done - wait_background_check pids - return_code=$? - if [ $return_code -ne 0 ]; then return $return_code; fi - - ERRORS=0 - if test $recov_scrub_count -eq 0 - then - echo "No scrubs occurred while PG recovering" - ERRORS=$(expr $ERRORS + 1) - fi - - pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid') - pid=$(cat $pidfile) - if ! 
kill -0 $pid - then - echo "OSD crash occurred" - #tail -100 $dir/osd.0.log - ERRORS=$(expr $ERRORS + 1) - fi - - # Work around for http://tracker.ceph.com/issues/38195 - kill_daemons $dir #|| return 1 - - declare -a err_strings - ## we do not expect a refusal to scrub - err_strings[0]="recovery in progress.*scrubs" - for osd in $(seq 0 $(expr $OSDS - 1)) - do - grep "recovery in progress.*scrubs" $dir/osd.${osd}.log - done - for err_string in "${err_strings[@]}" - do - found=false - for osd in $(seq 0 $(expr $OSDS - 1)) - do - if grep "$err_string" $dir/osd.${osd}.log > /dev/null; - then - found=true - fi - done - if [ "$found" = "true" ]; then - echo "Found log message not expected '$err_string'" - ERRORS=$(expr $ERRORS + 1) - fi - done - - teardown $dir || return 1 - - if [ $ERRORS != "0" ]; - then - echo "TEST FAILED WITH $ERRORS ERRORS" - return 1 - fi - - echo "TEST PASSED" - return 0 -} - main osd-recovery-scrub "$@" # Local Variables: diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh index 59564f7e37e..491e46603f7 100755 --- a/qa/standalone/scrub/osd-scrub-repair.sh +++ b/qa/standalone/scrub/osd-scrub-repair.sh @@ -442,7 +442,6 @@ function TEST_auto_repair_bluestore_basic() { ['pool_name']="testpool" ['extras']=" --osd_scrub_auto_repair=true" ) - local extr_dbg=3 standard_scrub_cluster $dir cluster_conf local poolid=${cluster_conf['pool_id']} local poolname=${cluster_conf['pool_name']} @@ -6252,6 +6251,254 @@ function TEST_request_scrub_priority() { grep "log_channel.*scrub ok" $dir/osd.${primary}.log | grep -v purged_snaps | head -1 | sed 's/.*[[]DBG[]]//' | grep -q $pg || return 1 } +# +# Testing the "split scrub store" feature: shallow scrubs do not +# purge deep errors from the store. +# +# Corrupt one copy of a replicated pool, creating both shallow and deep errors. +# Then shallow-scrub the pool and verify that the deep errors are still present. 
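The comment above states the property the new test verifies: with the split scrub store, a shallow scrub must not purge errors that only a deep scrub can detect. A condensed sketch of that core check, assuming the pg_scrub/pg_deep_scrub helpers the rest of this file already relies on, with $pg and $dir set up as in the function below and file names chosen for illustration:

    # deep scrub and record the full inconsistency report
    pg_deep_scrub $pg
    rados list-inconsistent-obj $pg | jq '.inconsistents' > $dir/deep.json

    # a later shallow scrub must leave the deep-scrub findings in the store
    pg_scrub $pg
    rados list-inconsistent-obj $pg | jq '.inconsistents' > $dir/shallow.json

    # with the split store the two reports are expected to be identical
    diff -u $dir/deep.json $dir/shallow.json || return 1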
+# +function TEST_dual_store_replicated_cluster() { + local dir=$1 + local poolname=csr_pool + local total_objs=19 + local extr_dbg=1 # note: 3 and above leave some temp files around + + run_mon $dir a --osd_pool_default_size=2 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 + local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 " + ceph_osd_args+="--osd_scrub_backoff_ratio=0 --osd_stats_update_period_not_scrubbing=3 " + ceph_osd_args+="--osd_stats_update_period_scrubbing=2 --osd_op_queue=wpq --osd_scrub_auto_repair=0 " + for osd in $(seq 0 1) + do + run_osd $dir $osd $ceph_osd_args || return 1 + done + + create_rbd_pool || return 1 + wait_for_clean || return 1 + + create_pool foo 1 || return 1 + create_pool $poolname 1 1 || return 1 + wait_for_clean || return 1 + + ceph osd pool set $poolname noscrub 1 + ceph osd pool set $poolname nodeep-scrub 1 + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + add_something $dir $poolname $objname || return 1 + + rados --pool $poolname setomapheader $objname hdr-$objname || return 1 + rados --pool $poolname setomapval $objname key-$objname val-$objname || return 1 + done + + # Increase file 1 MB + 1KB + dd if=/dev/zero of=$dir/new.ROBJ19 bs=1024 count=1025 + rados --pool $poolname put $objname $dir/new.ROBJ19 || return 1 + rm -f $dir/new.ROBJ19 + + local pg=$(get_pg $poolname ROBJ0) + local primary=$(get_primary $poolname ROBJ0) + + # Compute an old omap digest and save oi + CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) \ + config set osd_deep_scrub_update_digest_min_age 0 + CEPH_ARGS='' ceph daemon $(get_asok_path osd.1) \ + config set osd_deep_scrub_update_digest_min_age 0 + pg_deep_scrub $pg + + for i in $(seq 1 $total_objs) ; do + objname=ROBJ${i} + + # Alternate corruption between osd.0 and osd.1 + local osd=$(expr $i % 2) + + case $i in + 1) + # Size (deep scrub data_digest too) + local payload=UVWXYZZZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 2) + # digest (deep scrub only) + local payload=UVWXYZ + echo $payload > $dir/CORRUPT + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + ;; + + 3) + # missing + objectstore_tool $dir $osd $objname remove || return 1 + ;; + + 4) + # Modify omap value (deep scrub only) + objectstore_tool $dir $osd $objname set-omap key-$objname $dir/CORRUPT || return 1 + ;; + + 5) + # Delete omap key (deep scrub only) + objectstore_tool $dir $osd $objname rm-omap key-$objname || return 1 + ;; + + 6) + # Add extra omap key (deep scrub only) + echo extra > $dir/extra-val + objectstore_tool $dir $osd $objname set-omap key2-$objname $dir/extra-val || return 1 + rm $dir/extra-val + ;; + + 7) + # Modify omap header (deep scrub only) + echo -n newheader > $dir/hdr + objectstore_tool $dir $osd $objname set-omaphdr $dir/hdr || return 1 + rm $dir/hdr + ;; + + 8) + rados --pool $poolname setxattr $objname key1-$objname val1-$objname || return 1 + rados --pool $poolname setxattr $objname key2-$objname val2-$objname || return 1 + + # Break xattrs + echo -n bad-val > $dir/bad-val + objectstore_tool $dir $osd $objname set-attr _key1-$objname $dir/bad-val || return 1 + objectstore_tool $dir $osd $objname rm-attr _key2-$objname || return 1 + echo -n val3-$objname > $dir/newval + objectstore_tool $dir $osd $objname set-attr _key3-$objname $dir/newval || return 1 + rm $dir/bad-val $dir/newval + ;; + + 9) + objectstore_tool $dir $osd $objname get-attr _ > $dir/robj9-oi + echo -n D > 
$dir/change + rados --pool $poolname put $objname $dir/change + objectstore_tool $dir $osd $objname set-attr _ $dir/robj9-oi + rm $dir/oi $dir/change + ;; + + # ROBJ10 must be handled after digests are re-computed by a deep scrub below + # ROBJ11 must be handled with config change before deep scrub + # ROBJ12 must be handled with config change before scrubs + # ROBJ13 must be handled before scrubs + + 14) + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 0 $objname set-attr _ $dir/bad-val || return 1 + objectstore_tool $dir 1 $objname rm-attr _ || return 1 + rm $dir/bad-val + ;; + + 15) + objectstore_tool $dir $osd $objname rm-attr _ || return 1 + ;; + + 16) + objectstore_tool $dir 0 $objname rm-attr snapset || return 1 + echo -n bad-val > $dir/bad-val + objectstore_tool $dir 1 $objname set-attr snapset $dir/bad-val || return 1 + ;; + + 17) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ17 + echo $payload > $dir/new.ROBJ17 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ17 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ17 || return 1 + ;; + + 18) + # Deep-scrub only (all replicas are diffent than the object info + local payload=ROBJ18 + echo $payload > $dir/new.ROBJ18 + objectstore_tool $dir 0 $objname set-bytes $dir/new.ROBJ18 || return 1 + objectstore_tool $dir 1 $objname set-bytes $dir/new.ROBJ18 || return 1 + # Make one replica have a different object info, so a full repair must happen too + objectstore_tool $dir $osd $objname corrupt-info || return 1 + ;; + + 19) + # Set osd-max-object-size smaller than this object's size + + esac + done + + local pg=$(get_pg $poolname ROBJ0) + + ceph tell osd.\* injectargs -- --osd-max-object-size=1048576 + + inject_eio rep data $poolname ROBJ11 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + inject_eio rep mdata $poolname ROBJ12 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + inject_eio rep data $poolname ROBJ13 $dir 0 || return 1 # shard 0 of [1, 0], osd.1 + + # first sequence: the final shallow scrub should not override any of the deep errors + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1.json + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_1b.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh1_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_1b_s.json + + pg_deep_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_2.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dp_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_2s.json + + pg_scrub $pg + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' 
> /tmp/WQR_3.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/sh2_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_3s.json + + diff -u $dir/dp_results.json $dir/sh2_results.json || return 1 + + # inject a read error, which is a special case: the scrub encountering the read error + # would override the previously collected shard info. + inject_eio rep mdata $poolname ROBJ13 $dir 1 || return 1 # shard 1 of [1, 0], osd.0 + + pg_deep_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_4.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_4s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > $dir/dpPart2_w13_results.json + # Remove the entry with "name":"ROBJ13" from the $dir/d*_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' | \ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/dpPart2_wo13_results.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_4s.json + + pg_scrub $pg + + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | python3 -c "$sortkeys" | jq '.' > /tmp/WQR_5.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | \ + python3 -c "$sortkeys" > /tmp/WQR_5s_w13.json + (( extr_dbg >= 3 )) && rados list-inconsistent-obj $pg | jq "$jqfilter" | \ + jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > /tmp/WQR_5s_wo13.json + + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq '.inconsistents' | python3 -c "$sortkeys" > \ + $dir/sh2Part2_w13_results.json + rados list-inconsistent-obj $pg | jq "$jqfilter" | jq 'del(.inconsistents[] | select(.object.name == "ROBJ13"))' |\ + jq '.inconsistents' | python3 -c "$sortkeys" > $dir/shPart2_wo13_results.json + + # the shallow scrub results should differ from the results of the deep + # scrub preceding it, but the difference should be limited to ROBJ13 + diff -u $dir/dpPart2_w13_results.json $dir/sh2Part2_w13_results.json && return 1 + diff -u $dir/dpPart2_wo13_results.json $dir/shPart2_wo13_results.json || return 1 + + ceph osd pool rm $poolname $poolname --yes-i-really-really-mean-it + return 0 +} + main osd-scrub-repair "$@" diff --git a/qa/suites/crimson-rados/perf/deploy/ceph.yaml b/qa/suites/crimson-rados/perf/deploy/ceph.yaml index 0f6021975a4..50d170f5022 100644 --- a/qa/suites/crimson-rados/perf/deploy/ceph.yaml +++ b/qa/suites/crimson-rados/perf/deploy/ceph.yaml @@ -10,3 +10,4 @@ tasks: osd: debug monc: 20 flavor: crimson +- ssh_keys: diff --git a/qa/suites/fs/libcephfs/tasks/client.yaml b/qa/suites/fs/libcephfs/tasks/client.yaml index da841373220..42ca9336c8e 100644 --- a/qa/suites/fs/libcephfs/tasks/client.yaml +++ b/qa/suites/fs/libcephfs/tasks/client.yaml @@ -12,3 +12,4 @@ tasks: clients: client.0: - client/test.sh + - client/test_oc_disabled.sh diff --git 
a/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml b/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/featureful_client/old_client/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml b/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/featureful_client/upgraded_client/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml index 713adb9628a..96e4353e99c 100644 --- a/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml +++ b/qa/suites/fs/upgrade/mds_upgrade_sequence/overrides/ignorelist_upgrade.yaml @@ -2,3 +2,4 @@ overrides: ceph: log-ignorelist: - OSD_DOWN + - osd.*is down diff --git a/qa/suites/fs/upgrade/nofs/kernel.yaml b/qa/suites/fs/upgrade/nofs/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/nofs/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/upgrade/upgraded_client/kernel.yaml b/qa/suites/fs/upgrade/upgraded_client/kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/upgrade/upgraded_client/kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
\ No newline at end of file diff --git a/qa/suites/fs/workload/begin/3-kernel.yaml b/qa/suites/fs/workload/begin/3-kernel.yaml new file mode 120000 index 00000000000..a7f7b735665 --- /dev/null +++ b/qa/suites/fs/workload/begin/3-kernel.yaml @@ -0,0 +1 @@ +.qa/cephfs/begin/3-kernel.yaml
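The kernel.yaml symlinks added above all resolve to the new qa/cephfs/begin/3-kernel.yaml fragment introduced earlier in this diff: it strips a --kernel override from all nodes unless the kernel branch name ends in "-all", and k-testing.yaml then re-applies the testing kernel to client nodes only. A hypothetical scheduling command that opts out of the stripping via the "-all" suffix (suite, ceph branch and kernel branch names are illustrative, not taken from this diff):

    # a plain '--kernel testing' would be dropped for non-client nodes by 3-kernel.yaml;
    # a branch name ending in '-all' (hypothetical) is kept for every node
    teuthology-suite --suite fs:workload --ceph main --kernel wip-custom-kernel-all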
\ No newline at end of file diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml index 2e4741e8140..7c97edae552 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml new file mode 100644 index 00000000000..8eb4f6dc63c --- /dev/null +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_mtls.yaml @@ -0,0 +1,36 @@ +tasks: +- nvmeof: + installer: host.a + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + rbd: + pool_name: mypool + image_name_prefix: myimage + gateway_config: + subsystems_count: 3 + namespaces_count: 20 + cli_image: quay.io/ceph/nvmeof-cli:latest + create_mtls_secrets: true + +- cephadm.wait_for_service: + service: nvmeof.mypool.mygroup0 + +- workunit: + no_coverage_and_limits: true + timeout: 30m + clients: + client.0: + - nvmeof/setup_subsystem.sh + - nvmeof/basic_tests.sh + - nvmeof/fio_test.sh --rbd_iostat + env: + RBD_POOL: mypool + RBD_IMAGE_PREFIX: myimage + IOSTAT_INTERVAL: '10' + RUNTIME: '60' + +- workunit: + no_coverage_and_limits: true + timeout: 30m + clients: + client.0: + - nvmeof/mtls_test.sh diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml index 2e873a04bab..9ef37004427 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_namespaces.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml index 83d16e4cb2c..12cb50b408d 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 
namespaces_count: 20 - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml index 6db0c0d4e18..b4755a6433b 100644 --- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml +++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml @@ -1,14 +1,14 @@ tasks: - nvmeof: installer: host.a - gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" + gw_image: quay.io/ceph/nvmeof:latest # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest" rbd: pool_name: mypool image_name_prefix: myimage gateway_config: subsystems_count: 3 namespaces_count: 20 # each subsystem - cli_image: quay.io/ceph/nvmeof-cli:1.2 + cli_image: quay.io/ceph/nvmeof-cli:latest - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 diff --git a/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml new file mode 100644 index 00000000000..3bbf30ea427 --- /dev/null +++ b/qa/suites/orch/cephadm/smb/tasks/deploy_smb_mgr_clustering_ips.yaml @@ -0,0 +1,91 @@ +roles: +# Test is for basic smb deployment & functionality. one node cluster is OK +- - host.a + - mon.a + - mgr.x + - osd.0 + - osd.1 + - client.0 +- - host.b + - mon.b + - osd.2 + - osd.3 +- - host.c + - mon.c + - osd.4 + - osd.5 +# Reserve a host for acting as a domain controller and smb client +- - host.d + - cephadm.exclude +overrides: + ceph: + log-only-match: + - CEPHADM_ +tasks: +- cephadm.configure_samba_client_container: + role: host.d +- vip: + count: 1 +- cephadm: + +- cephadm.shell: + host.a: + - ceph fs volume create cephfs +- cephadm.wait_for_service: + service: mds.cephfs + +- cephadm.shell: + host.a: + # add subvolgroup & subvolumes for test + - cmd: ceph fs subvolumegroup create cephfs smb + - cmd: ceph fs subvolume create cephfs sv1 --group-name=smb --mode=0777 + - cmd: ceph fs subvolume create cephfs sv2 --group-name=smb --mode=0777 + # set up smb cluster and shares + - cmd: ceph mgr module enable smb + - cmd: sleep 30 + - cmd: > + ceph smb cluster create modusr1 user + --define-user-pass=user1%t3stP4ss1 + --placement=count:3 + --clustering=default + --public_addrs={{VIP0}}/{{VIPPREFIXLEN}} + - cmd: ceph smb share create modusr1 share1 cephfs / --subvolume=smb/sv1 + - cmd: ceph smb share create modusr1 share2 cephfs / --subvolume=smb/sv2 +# Wait for the smb service to start +- cephadm.wait_for_service: + service: smb.modusr1 + +# Check if shares exist +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{'host.a'|role_to_remote|attr('ip_address')}}/share2 -c ls" + +# verify CTDB is healthy, cluster well formed +- cephadm.exec: + host.a: + - "{{ctx.cephadm}} ls --no-detail | {{ctx.cephadm}} shell jq -r 'map(select(.name | startswith(\"smb.modusr1\")))[-1].name' > /tmp/svcname" + - "{{ctx.cephadm}} enter -n $(cat /tmp/svcname) ctdb status > /tmp/ctdb_status" + - cat /tmp/ctdb_status + - grep 'pnn:0 .*OK' /tmp/ctdb_status + - grep 'pnn:1 .*OK' 
/tmp/ctdb_status + - grep 'pnn:2 .*OK' /tmp/ctdb_status + - grep 'Number of nodes:3' /tmp/ctdb_status + - rm -rf /tmp/svcname /tmp/ctdb_status + +# Test the assigned VIP +- cephadm.exec: + host.d: + - sleep 30 + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{VIP0}}/share1 -c ls" + - "{{ctx.samba_client_container_cmd|join(' ')}} smbclient -U user1%t3stP4ss1 //{{VIP0}}/share2 -c ls" + +- cephadm.shell: + host.a: + - cmd: ceph smb share rm modusr1 share2 + - cmd: ceph smb share rm modusr1 share1 + - cmd: ceph smb cluster rm modusr1 +# Wait for the smb service to be removed +- cephadm.wait_for_service_not_present: + service: smb.modusr1 diff --git a/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml b/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml index 0080d3bf730..c6bec082843 100644 --- a/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml +++ b/qa/suites/orch/cephadm/upgrade/3-upgrade/staggered.yaml @@ -131,8 +131,10 @@ tasks: - ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --services rgw.foo - while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done - ceph orch ps + - ceph versions # verify all rgw daemons on same version and version hash matches what we are upgrading to - - ceph versions | jq -e '.rgw | length == 1' + # `ceph versions` might not get updated immediately for rgw so retry this + - time timeout 60 bash -c "until ceph versions | jq -e '.rgw | length == 1'; do sleep 2; done" - ceph versions | jq -e '.rgw | keys' | grep $sha1 - ceph orch upgrade status - ceph health detail diff --git a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml new file mode 100644 index 00000000000..5207fd415b7 --- /dev/null +++ b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml @@ -0,0 +1,77 @@ +overrides: + ceph: + log-ignorelist: + - CEPHADM_FAILED_DAEMON + log-only-match: + - CEPHADM_ +roles: +- - host.a + - mon.a + - mgr.a + - osd.0 +- - host.b + - mon.b + - mgr.b + - osd.1 +- - host.c + - mon.c + - osd.2 +tasks: +- install: +- cephadm: +- cephadm.shell: + host.c: + - | + set -ex + # Deploy monitoring stack + ceph orch apply node-exporter + ceph orch apply grafana + ceph orch apply alertmanager + ceph orch apply prometheus + sleep 240 + # generate SSL certificate + openssl req -x509 -newkey rsa:4096 -keyout /tmp/key.pem -out /tmp/cert.pem -sha256 -days 30 -nodes -subj "/CN=*" + # Generate a mgmt.spec template + cat << EOT > /tmp/mgmt.spec + service_type: mgmt-gateway + service_id: foo + placement: + hosts: + - ${HOSTNAME} + spec: + ssl_protocols: + - TLSv1.2 + - TLSv1.3 + ssl_ciphers: + - AES128-SHA + - AES256-SHA + enable_health_check_endpoint: True + EOT + # Add generated certificates to spec file + echo " ssl_certificate: |" >> /tmp/mgmt.spec + while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/cert.pem >> /tmp/mgmt.spec + echo " ssl_certificate_key: |" >> /tmp/mgmt.spec + while read LINE; do echo $LINE | sed -e "s/^/ /"; done < /tmp/key.pem >> /tmp/mgmt.spec + # Apply spec + ceph orch apply -i /tmp/mgmt.spec +- cephadm.wait_for_service: + service: mgmt-gateway +- cephadm.shell: + host.a: + - | + set -ex + # retrieve mgmt hostname and ip + MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname') + MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg 
MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr') + # check mgmt-gateway health + curl -k -s https://${MGMT_GTW_IP}/health + curl -k -s https://${MGMT_GTW_IP}:29443/health + # wait for background services to be reconfigured following mgmt-gateway installation + sleep 180 + # check grafana endpoints are responsive and database health is okay + curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"' + # check prometheus endpoints are responsive + curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status == "success"' + # check alertmanager endpoints are responsive + curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status + diff --git a/qa/suites/rbd/iscsi/0-single-container-host.yaml b/qa/suites/rbd/iscsi/0-single-container-host.yaml deleted file mode 120000 index 7406e749cf5..00000000000 --- a/qa/suites/rbd/iscsi/0-single-container-host.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/distros/single-container-host.yaml
\ No newline at end of file diff --git a/qa/suites/rbd/iscsi/base/install.yaml b/qa/suites/rbd/iscsi/base/install.yaml index 5c5a6c31f60..cca178cafe8 100644 --- a/qa/suites/rbd/iscsi/base/install.yaml +++ b/qa/suites/rbd/iscsi/base/install.yaml @@ -9,6 +9,10 @@ tasks: - ceph orch host ls - ceph orch device ls - install: - extra_packages: + extra_system_packages: + deb: + - open-iscsi + - multipath-tools + rpm: - iscsi-initiator-utils - device-mapper-multipath diff --git a/qa/suites/rbd/iscsi/supported-container-hosts$ b/qa/suites/rbd/iscsi/supported-container-hosts$ new file mode 120000 index 00000000000..30a61f1575f --- /dev/null +++ b/qa/suites/rbd/iscsi/supported-container-hosts$ @@ -0,0 +1 @@ +.qa/distros/supported-container-hosts/
\ No newline at end of file diff --git a/qa/suites/rgw/multifs/0-install.yaml b/qa/suites/rgw/multifs/0-install.yaml new file mode 100644 index 00000000000..7e83140e64a --- /dev/null +++ b/qa/suites/rgw/multifs/0-install.yaml @@ -0,0 +1,5 @@ +tasks: +- install: +- ceph: +- rgw: [client.0] +- tox: [client.0] diff --git a/qa/suites/rgw/multifs/tasks/+ b/qa/suites/rgw/multifs/tasks/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/multifs/tasks/+ diff --git a/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml b/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml index e07c8b5ccfe..d9526c365c1 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml +++ b/qa/suites/rgw/multifs/tasks/rgw_bucket_quota.yaml @@ -1,13 +1,5 @@ tasks: -- install: -- ceph: -- rgw: [client.0] - workunit: clients: client.0: - rgw/s3_bucket_quota.pl -overrides: - ceph: - conf: - client: - rgw relaxed s3 bucket names: true diff --git a/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml b/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml index bac4f401626..ae32e928661 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml +++ b/qa/suites/rgw/multifs/tasks/rgw_multipart_upload.yaml @@ -1,13 +1,5 @@ tasks: -- install: -- ceph: -- rgw: [client.0] - workunit: clients: client.0: - rgw/s3_multipart_upload.pl -overrides: - ceph: - conf: - client: - rgw relaxed s3 bucket names: true diff --git a/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml b/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml index 66bdff817f5..184555660dc 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml +++ b/qa/suites/rgw/multifs/tasks/rgw_ragweed.yaml @@ -1,8 +1,4 @@ tasks: -- install: -- ceph: -- rgw: [client.0] -- tox: [client.0] - ragweed: client.0: default-branch: ceph-master diff --git a/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml b/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml index 92355f04963..573cffbc30a 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml +++ b/qa/suites/rgw/multifs/tasks/rgw_s3tests.yaml @@ -1,8 +1,4 @@ tasks: -- install: -- ceph: -- rgw: [client.0] -- tox: [client.0] - s3tests: client.0: rgw_server: client.0 diff --git a/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml b/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml index 92c63d2e850..393180e5c17 100644 --- a/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml +++ b/qa/suites/rgw/multifs/tasks/rgw_user_quota.yaml @@ -1,13 +1,5 @@ tasks: -- install: -- ceph: -- rgw: [client.0] - workunit: clients: client.0: - rgw/s3_user_quota.pl -overrides: - ceph: - conf: - client: - rgw relaxed s3 bucket names: true diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml index 3814ea3efdb..146bd57960d 100644 --- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml @@ -31,3 +31,5 @@ overrides: conf: osd: osd shutdown pgref assert: true + log-ignorelist: + - PG_DEGRADED diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml index bf3005fad45..ce4e0cc228b 100644 --- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml @@ -6,6 +6,7 @@ overrides: - MON_DOWN - out of quorum - PG_AVAILABILITY + - PG_DEGRADED tasks: - install: branch: reef diff --git a/qa/tasks/cbt.py b/qa/tasks/cbt.py index 84e096520b4..e6a9dc8223c 100644 --- a/qa/tasks/cbt.py +++ b/qa/tasks/cbt.py @@ -47,22 +47,11 @@ class CBT(Task): benchmark_config = 
self.config.get('benchmarks') benchmark_type = next(iter(benchmark_config.keys())) + if benchmark_type in ['librbdfio', 'fio']: testdir = misc.get_testdir(self.ctx) benchmark_config[benchmark_type]['cmd_path'] = os.path.join(testdir, 'fio/fio') - if benchmark_type == 'cosbench': - # create cosbench_dir and cosbench_xml_dir - testdir = misc.get_testdir(self.ctx) - benchmark_config['cosbench']['cosbench_dir'] = os.path.join(testdir, 'cos') - benchmark_config['cosbench']['cosbench_xml_dir'] = os.path.join(testdir, 'xml') - self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', benchmark_config['cosbench']['cosbench_xml_dir']]) - benchmark_config['cosbench']['controller'] = osd_hosts[0] - - # set auth details - remotes_and_roles = self.ctx.cluster.remotes.items() - ips = [host for (host, port) in - (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)] - benchmark_config['cosbench']['auth'] = "username=cosbench:operator;password=intel2012;url=http://%s:80/auth/v1.0;retry=9" %(ips[0]) + client_endpoints_config = self.config.get('client_endpoints', None) monitoring_profiles = self.config.get('monitoring_profiles', {}) @@ -117,77 +106,6 @@ class CBT(Task): ] ) - if benchmark_type == 'cosbench': - # install cosbench - self.log.info('install dependencies for cosbench') - if system_type == 'rpm': - cosbench_depends = ['wget', 'unzip', 'java-1.7.0-openjdk', 'curl'] - else: - cosbench_depends = ['wget', 'unzip', 'openjdk-8-jre', 'curl'] - self.first_mon.run(args=install_cmd + cosbench_depends) - testdir = misc.get_testdir(self.ctx) - cosbench_version = '0.4.2.c3' - cosbench_location = 'https://github.com/intel-cloud/cosbench/releases/download/v0.4.2.c3/0.4.2.c3.zip' - os_version = misc.get_system_type(self.first_mon, False, True) - - # additional requirements for bionic - if os_version == '18.04': - self.first_mon.run( - args=['sudo', 'apt-get', '-y', 'purge', 'openjdk-11*']) - # use our own version of cosbench - cosbench_version = 'cosbench-0.4.2.c3.1' - # contains additional parameter "-N" to nc - cosbench_location = 'http://drop.ceph.com/qa/cosbench-0.4.2.c3.1.zip' - cosbench_dir = os.path.join(testdir, cosbench_version) - self.ctx.cluster.run(args=['mkdir', '-p', '-m0755', '--', cosbench_dir]) - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'wget', - cosbench_location, run.Raw('&&'), - 'unzip', '{name}.zip'.format(name=cosbench_version), '-d', cosbench_version - ] - ) - else: - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'wget', - cosbench_location, run.Raw('&&'), - 'unzip', '{name}.zip'.format(name=cosbench_version) - ] - ) - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'ln', '-s', cosbench_version, 'cos', - ] - ) - self.first_mon.run( - args=[ - 'cd', os.path.join(testdir, 'cos'), run.Raw('&&'), - 'chmod', '+x', run.Raw('*.sh'), - ] - ) - - # start cosbench and check info - self.log.info('start cosbench') - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'start-all.sh' - ] - ) - self.log.info('check cosbench info') - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'cli.sh', 'info' - ] - ) - def checkout_cbt(self): testdir = misc.get_testdir(self.ctx) repo = self.config.get('repo', 'https://github.com/ceph/cbt.git') @@ -269,51 +187,6 @@ class CBT(Task): ] ) - if benchmark_type == 'cosbench': - os_version = misc.get_system_type(self.first_mon, False, True) - if os_version == '18.04': - cosbench_version = 
'cosbench-0.4.2.c3.1' - else: - cosbench_version = '0.4.2.c3' - # note: stop-all requires 'nc' - self.first_mon.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'cos', run.Raw('&&'), - 'sh', 'stop-all.sh', - run.Raw('||'), 'true' - ] - ) - self.first_mon.run( - args=[ - 'sudo', 'killall', '-9', 'java', - run.Raw('||'), 'true' - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/cos'.format(tdir=testdir), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/{version}'.format(tdir=testdir, version=cosbench_version), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/{version}.zip'.format(tdir=testdir, version=cosbench_version), - ] - ) - self.first_mon.run( - args=[ - 'rm', '--one-file-system', '-rf', '--', - '{tdir}/xml'.format(tdir=testdir), - ] - ) # Collect cbt performance data cbt_performance = CBTperformance() cbt_performance.collect(self.ctx, self.config) diff --git a/qa/tasks/ceph_iscsi_client.py b/qa/tasks/ceph_iscsi_client.py index 189b7fa31fe..0b0a355f925 100644 --- a/qa/tasks/ceph_iscsi_client.py +++ b/qa/tasks/ceph_iscsi_client.py @@ -31,8 +31,15 @@ def task(ctx, config): remote.run(args=['sudo', 'systemctl', 'restart', 'iscsid']) remote.run(args=['sudo', 'modprobe', 'dm_multipath']) - remote.run(args=['sudo', 'mpathconf', '--enable']) conf = dedent(''' + defaults { + user_friendly_names yes + find_multipaths yes + } + + blacklist { + } + devices { device { vendor "LIO-ORG" @@ -50,7 +57,7 @@ def task(ctx, config): } ''') path = "/etc/multipath.conf" - remote.sudo_write_file(path, conf, append=True) + remote.sudo_write_file(path, conf) remote.run(args=['sudo', 'systemctl', 'start', 'multipathd']) yield diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py index 1c00a49077d..2b7fd2ee569 100644 --- a/qa/tasks/cephfs/filesystem.py +++ b/qa/tasks/cephfs/filesystem.py @@ -640,8 +640,11 @@ class FilesystemBase(MDSClusterBase): def set_joinable(self, joinable=True): self.set_var("joinable", joinable) - def set_max_mds(self, max_mds): - self.set_var("max_mds", "%d" % max_mds) + def set_max_mds(self, max_mds, confirm=True): + if confirm: + self.set_var('max_mds', f'{max_mds}', '--yes-i-really-mean-it') + else: + self.set_var("max_mds", f"{max_mds}",) def set_session_timeout(self, timeout): self.set_var("session_timeout", "%d" % timeout) diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py index ec06e38d78e..beb41019e6d 100644 --- a/qa/tasks/cephfs/test_admin.py +++ b/qa/tasks/cephfs/test_admin.py @@ -324,6 +324,8 @@ class TestFsStatus(TestAdminCommands): Test "ceph fs status subcommand. """ + MDSS_REQUIRED = 3 + def test_fs_status(self): """ That `ceph fs status` command functions. @@ -338,6 +340,31 @@ class TestFsStatus(TestAdminCommands): mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json"))["mdsmap"] self.assertEqual(mdsmap[0]["state"], "active") + def test_fs_status_standby_replay(self): + """ + That `ceph fs status` command functions. 
+ """ + + self.fs.set_allow_standby_replay(True) + + s = self.get_ceph_cmd_stdout("fs", "status") + self.assertTrue("active" in s) + self.assertTrue("standby-replay" in s) + self.assertTrue("0-s" in s) + self.assertTrue("standby" in s) + + mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json-pretty"))["mdsmap"] + self.assertEqual(mdsmap[0]["state"], "active") + self.assertEqual(mdsmap[1]["state"], "standby-replay") + self.assertEqual(mdsmap[1]["rank"], "0-s") + self.assertEqual(mdsmap[2]["state"], "standby") + + mdsmap = json.loads(self.get_ceph_cmd_stdout("fs", "status", "--format=json"))["mdsmap"] + self.assertEqual(mdsmap[0]["state"], "active") + self.assertEqual(mdsmap[1]["state"], "standby-replay") + self.assertEqual(mdsmap[1]["rank"], "0-s") + self.assertEqual(mdsmap[2]["state"], "standby") + class TestAddDataPool(TestAdminCommands): """ @@ -2656,3 +2683,241 @@ class TestMDSFail(TestAdminCommands): errmsgs=health_warn) self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it') self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it') + + +class TestFSSetMaxMDS(TestAdminCommands): + + def test_when_unhealthy_without_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num>" without the + confirmation flag (--yes-i-really-mean-it) fails when cluster is + unhealthy. + ''' + self.gen_health_warn_mds_cache_oversized() + + with self.assertRaises(CommandFailedError) as cfe: + self.fs.set_max_mds(2, confirm=False) + self.assertEqual(cfe.exception.exitstatus, errno.EPERM) + + def test_when_unhealthy_with_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num> + --yes-i-really-mean-it" runs successfully when cluster is unhealthy. + ''' + self.gen_health_warn_mds_cache_oversized() + + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + def test_when_mds_trim_without_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num>" without the + confirmation flag (--yes-i-really-mean-it) fails when cluster has + MDS_TRIM health warning. + ''' + self.gen_health_warn_mds_trim() + + with self.assertRaises(CommandFailedError) as cfe: + self.fs.set_max_mds(2, confirm=False) + self.assertEqual(cfe.exception.exitstatus, errno.EPERM) + + def test_when_mds_trim_when_with_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num> + --yes-i-really-mean-it" runs successfully when cluster has MDS_TRIM + health warning. + ''' + self.gen_health_warn_mds_trim() + + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + def test_when_healthy_with_confirm(self): + ''' + Test that command "ceph fs set <fsname> max_mds <num> + --yes-i-really-mean-it" runs successfully also when cluster is + healthy. + ''' + self.fs.set_max_mds(2, confirm=True) + self.assertEqual(self.fs.get_var('max_mds'), 2) + + +class TestToggleVolumes(CephFSTestCase): + ''' + Contains code for enabling/disabling mgr/volumes plugin. + ''' + + VOL_MOD_NAME = 'volumes' + CONFIRM = '--yes-i-really-mean-it' + + def tearDown(self): + ''' + Ensure that the volumes plugin is enabled after the test has finished + running since not doing so might affect tearDown() of CephFSTestCase or + other superclasses. 
+ ''' + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + if 'volumes' in json_output['force_disabled_modules']: + self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}') + + super(TestToggleVolumes, self).tearDown() + + def test_force_disable_with_confirmation(self): + ''' + Test that running "ceph mgr module force disable volumes + --yes-i-really-mean-it" successfully disables volumes plugin. + + Also test "ceph mgr module ls" output after this. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + + def test_force_disable_fails_without_confirmation(self): + ''' + Test that running "ceph mgr module force disable volumes" fails with + EPERM when confirmation flag is not passed along. + + Also test that output of this command suggests user to pass + --yes-i-really-mean-it. + ''' + proc = self.run_ceph_cmd( + f'mgr module force disable {self.VOL_MOD_NAME}', + stderr=StringIO(), check_status=False) + + self.assertEqual(proc.returncode, errno.EPERM) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('EPERM', proc_stderr) + # ensure that the confirmation flag was recommended + self.assertIn(self.CONFIRM, proc_stderr) + + def test_force_disable_idempotency(self): + ''' + Test that running "ceph mgr module force disable volumes" passes when + volumes plugin was already force disabled. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + sleep(5) + + json_output = self.get_ceph_cmd_stdout('mgr module ls --format ' + 'json-pretty') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + + # XXX: this this test, running this command 2nd time should pass. + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME}') + + def test_force_disable_nonexistent_mod(self): + ''' + Test that passing non-existent name to "ceph mgr module force disable" + command leads to an error. + ''' + proc = self.run_ceph_cmd( + f'mgr module force disable abcd {self.CONFIRM}', + check_status=False, stderr=StringIO()) + self.assertEqual(proc.returncode, errno.EINVAL) + self.assertIn('EINVAL', proc.stderr.getvalue()) + + def test_force_disable_non_alwayson_mod(self): + ''' + Test that passing non-existent name to "ceph mgr module force disable" + command leads to an error. 
+ ''' + json_output = self.get_ceph_cmd_stdout( + 'mgr module ls --format json-pretty', check_status=False, + stderr=StringIO()) + output_dict = json.loads(json_output) + some_non_alwayson_mod = output_dict['enabled_modules'][0] + + proc = self.run_ceph_cmd( + f'mgr module force disable {some_non_alwayson_mod} {self.CONFIRM}', + check_status=False, stderr=StringIO()) + self.assertEqual(proc.returncode, errno.EINVAL) + self.assertIn('EINVAL', proc.stderr.getvalue()) + + def test_enabled_by_default(self): + ''' + Test that volumes plugin is enabled by default and is also reported as + "always on". + ''' + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + def test_disable_fails(self): + ''' + Test that running "ceph mgr module disable volumes" fails with EPERM. + + This is expected since volumes is an always-on module and therefore + it can only be disabled using command "ceph mgr module force disable + volumes". + ''' + proc = self.run_ceph_cmd(f'mgr module disable {self.VOL_MOD_NAME}', + stderr=StringIO(), check_status=False) + self.assertEqual(proc.returncode, errno.EPERM) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('EPERM', proc_stderr) + + def test_enable_idempotency(self): + ''' + Test that enabling volumes plugin when it is already enabled doesn't + exit with non-zero return value. + + Also test that it reports plugin as already enabled. + ''' + proc = self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}', + stderr=StringIO()) + self.assertEqual(proc.returncode, 0) + + proc_stderr = proc.stderr.getvalue() + self.assertIn('already enabled', proc_stderr) + self.assertIn('always-on', proc_stderr) + + def test_enable_post_disabling(self): + ''' + Test that enabling volumes plugin after (force-)disabling it works + successfully. + + Alo test "ceph mgr module ls" output for volumes plugin afterwards. + ''' + self.run_ceph_cmd(f'mgr module force disable {self.VOL_MOD_NAME} ' + f'{self.CONFIRM}') + # give bit of time for plugin to be disabled. + sleep(5) + + self.run_ceph_cmd(f'mgr module enable {self.VOL_MOD_NAME}') + # give bit of time for plugin to be functional again + sleep(5) + json_output = self.get_ceph_cmd_stdout('mgr module ls --format json') + json_output = json.loads(json_output) + self.assertIn(self.VOL_MOD_NAME, json_output['always_on_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['enabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['disabled_modules']) + self.assertNotIn(self.VOL_MOD_NAME, json_output['force_disabled_modules']) + + # plugin is reported properly by "ceph mgr module ls" command, check if + # it is also working fine. 
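Reviewer note: the TestToggleVolumes cases above and below all follow the same pattern, namely parsing `ceph mgr module ls --format json` and asserting which of the module buckets the volumes plugin appears in. A minimal standalone sketch of that membership check, assuming only the JSON keys these tests already rely on (always_on_modules, enabled_modules, disabled_modules, force_disabled_modules) and a caller-supplied command helper; it mirrors the assertions in the tests rather than defining any new API:

    import json

    def module_buckets(run_ceph_cmd, name='volumes'):
        # For each bucket reported by "ceph mgr module ls", report whether
        # the named module is currently listed in it.
        ls = json.loads(run_ceph_cmd('mgr module ls --format json'))
        keys = ('always_on_modules', 'enabled_modules',
                'disabled_modules', 'force_disabled_modules')
        return {key: name in ls.get(key, []) for key in keys}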
+ self.run_ceph_cmd('fs volume ls') diff --git a/qa/tasks/cephfs/test_mirroring.py b/qa/tasks/cephfs/test_mirroring.py index 55de1c7b928..078db6a4a6d 100644 --- a/qa/tasks/cephfs/test_mirroring.py +++ b/qa/tasks/cephfs/test_mirroring.py @@ -432,6 +432,34 @@ class TestMirroring(CephFSTestCase): self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) self.mount_a.run_shell(["rmdir", "d1"]) + def test_directory_command_ls(self): + dir1 = 'dls1' + dir2 = 'dls2' + self.mount_a.run_shell(["mkdir", dir1]) + self.mount_a.run_shell(["mkdir", dir2]) + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + try: + self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir1}') + self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir2}') + time.sleep(10) + dirs_list = json.loads(self.get_ceph_cmd_stdout("fs", "snapshot", "mirror", "ls", self.primary_fs_name)) + # verify via asok + res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + dir_count = res['snap_dirs']['dir_count'] + self.assertTrue(len(dirs_list) == dir_count and f'/{dir1}' in dirs_list and f'/{dir2}' in dirs_list) + except CommandFailedError: + raise RuntimeError('Error listing directories') + except AssertionError: + raise RuntimeError('Wrong number of directories listed') + finally: + self.remove_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir1}') + self.remove_directory(self.primary_fs_name, self.primary_fs_id, f'/{dir2}') + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.mount_a.run_shell(["rmdir", dir1]) + self.mount_a.run_shell(["rmdir", dir2]) + def test_add_relative_directory_path(self): self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) try: @@ -560,7 +588,7 @@ class TestMirroring(CephFSTestCase): # create a bunch of files in a directory to snap self.mount_a.run_shell(["mkdir", "d0"]) - for i in range(50): + for i in range(100): self.mount_a.write_n_mb(os.path.join('d0', f'file.{i}'), 1) self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) @@ -574,7 +602,7 @@ class TestMirroring(CephFSTestCase): # take a snapshot self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) - time.sleep(30) + time.sleep(60) self.check_peer_status(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", '/d0', 'snap0', 1) self.verify_snapshot('d0', 'snap0') @@ -586,10 +614,10 @@ class TestMirroring(CephFSTestCase): self.assertGreater(second["counters"]["last_synced_start"], first["counters"]["last_synced_start"]) self.assertGreater(second["counters"]["last_synced_end"], second["counters"]["last_synced_start"]) self.assertGreater(second["counters"]["last_synced_duration"], 0) - self.assertEquals(second["counters"]["last_synced_bytes"], 52428800) # last_synced_bytes = 50 files of 1MB size each + self.assertEquals(second["counters"]["last_synced_bytes"], 104857600) # last_synced_bytes = 100 files of 1MB size each # some more IO - for i in range(75): + for i in range(150): self.mount_a.write_n_mb(os.path.join('d0', f'more_file.{i}'), 1) time.sleep(60) @@ -597,7 +625,7 @@ class TestMirroring(CephFSTestCase): # take another snapshot self.mount_a.run_shell(["mkdir", "d0/.snap/snap1"]) - time.sleep(60) + time.sleep(120) self.check_peer_status(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", '/d0', 'snap1', 2) self.verify_snapshot('d0', 'snap1') @@ -609,7 +637,7 @@ class TestMirroring(CephFSTestCase): 
self.assertGreater(third["counters"]["last_synced_start"], second["counters"]["last_synced_end"]) self.assertGreater(third["counters"]["last_synced_end"], third["counters"]["last_synced_start"]) self.assertGreater(third["counters"]["last_synced_duration"], 0) - self.assertEquals(third["counters"]["last_synced_bytes"], 78643200) # last_synced_bytes = 75 files of 1MB size each + self.assertEquals(third["counters"]["last_synced_bytes"], 157286400) # last_synced_bytes = 150 files of 1MB size each # delete a snapshot self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"]) @@ -1372,7 +1400,7 @@ class TestMirroring(CephFSTestCase): self.mount_b.umount_wait() self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) - # create a bunch of files in a directory to snap + # create some large files in 3 directories to snap self.mount_a.run_shell(["mkdir", "d0"]) self.mount_a.run_shell(["mkdir", "d1"]) self.mount_a.run_shell(["mkdir", "d2"]) @@ -1395,30 +1423,38 @@ class TestMirroring(CephFSTestCase): vbefore = res[TestMirroring.PERF_COUNTER_KEY_NAME_CEPHFS_MIRROR_PEER][0] # take snapshots log.debug('taking snapshots') - self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) - self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"]) - self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"]) + snap_name = "snap0" + self.mount_a.run_shell(["mkdir", f"d0/.snap/{snap_name}"]) + self.mount_a.run_shell(["mkdir", f"d1/.snap/{snap_name}"]) + self.mount_a.run_shell(["mkdir", f"d2/.snap/{snap_name}"]) - time.sleep(10) log.debug('checking snap in progress') - self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d0', 'snap0') - self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d1', 'snap0') - self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d2', 'snap0') + peer_spec = "client.mirror_remote@ceph" + peer_uuid = self.get_peer_uuid(peer_spec) + with safe_while(sleep=3, tries=100, action=f'wait for status: {peer_spec}') as proceed: + while proceed(): + res = self.mirror_daemon_command(f'peer status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{self.primary_fs_name}@{self.primary_fs_id}', + peer_uuid) + if ('syncing' == res["/d0"]['state'] and 'syncing' == res["/d1"]['state'] and \ + 'syncing' == res["/d2"]['state']): + break - log.debug('removing directories 1') + log.debug('removing directory 1') self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0') - log.debug('removing directories 2') + log.debug('removing directory 2') self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1') - log.debug('removing directories 3') + log.debug('removing directory 3') self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d2') + # Wait a while for the sync backoff + time.sleep(500) + log.debug('removing snapshots') - self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"]) - self.mount_a.run_shell(["rmdir", "d1/.snap/snap0"]) - self.mount_a.run_shell(["rmdir", "d2/.snap/snap0"]) + self.mount_a.run_shell(["rmdir", f"d0/.snap/{snap_name}"]) + self.mount_a.run_shell(["rmdir", f"d1/.snap/{snap_name}"]) + self.mount_a.run_shell(["rmdir", f"d2/.snap/{snap_name}"]) for i in range(4): filename = f'file.{i}' @@ -1438,26 +1474,27 @@ class TestMirroring(CephFSTestCase): self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d2') log.debug('creating new snapshots...') - self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) - 
self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"]) - self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"]) + self.mount_a.run_shell(["mkdir", f"d0/.snap/{snap_name}"]) + self.mount_a.run_shell(["mkdir", f"d1/.snap/{snap_name}"]) + self.mount_a.run_shell(["mkdir", f"d2/.snap/{snap_name}"]) + + # Wait for the threads to finish + time.sleep(500) - time.sleep(60) self.check_peer_status(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d0', 'snap0', 1) - self.verify_snapshot('d0', 'snap0') + "client.mirror_remote@ceph", '/d0', f'{snap_name}', 1) + self.verify_snapshot('d0', f'{snap_name}') self.check_peer_status(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d1', 'snap0', 1) - self.verify_snapshot('d1', 'snap0') + "client.mirror_remote@ceph", '/d1', f'{snap_name}', 1) + self.verify_snapshot('d1', f'{snap_name}') self.check_peer_status(self.primary_fs_name, self.primary_fs_id, - "client.mirror_remote@ceph", '/d2', 'snap0', 1) - self.verify_snapshot('d2', 'snap0') + "client.mirror_remote@ceph", '/d2', f'{snap_name}', 1) + self.verify_snapshot('d2', f'{snap_name}') res = self.mirror_daemon_command(f'counter dump for fs: {self.primary_fs_name}', 'counter', 'dump') vafter = res[TestMirroring.PERF_COUNTER_KEY_NAME_CEPHFS_MIRROR_PEER][0] self.assertGreater(vafter["counters"]["snaps_synced"], vbefore["counters"]["snaps_synced"]) - self.assertGreater(vafter["counters"]["snaps_deleted"], vbefore["counters"]["snaps_deleted"]) self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py index 7917bd9202f..14f54a784e7 100644 --- a/qa/tasks/cephfs/test_misc.py +++ b/qa/tasks/cephfs/test_misc.py @@ -558,16 +558,18 @@ class TestSessionClientEvict(CephFSTestCase): self.assertEqual(ce.exception.exitstatus, errno.EINVAL) def _evict_with_invalid_id(self, cmd): + info_initial = self.fs.rank_asok(cmd + ['ls']) # with invalid id - with self.assertRaises(CommandFailedError) as ce: - self.fs.rank_tell(cmd + ['evict', 'id=1']) - self.assertEqual(ce.exception.exitstatus, errno.ESRCH) + self.fs.rank_tell(cmd + ['evict', 'id=1']) + info = self.fs.rank_asok(cmd + ['ls']) + self.assertEqual(len(info), len(info_initial)) # session list is status-quo def _evict_with_negative_id(self, cmd): + info_initial = self.fs.rank_asok(cmd + ['ls']) # with negative id - with self.assertRaises(CommandFailedError) as ce: - self.fs.rank_tell(cmd + ['evict', 'id=-9']) - self.assertEqual(ce.exception.exitstatus, errno.ESRCH) + self.fs.rank_tell(cmd + ['evict', 'id=-9']) + info = self.fs.rank_asok(cmd + ['ls']) + self.assertEqual(len(info), len(info_initial)) # session list is status-quo def _evict_with_valid_id(self, cmd): info_initial = self.fs.rank_asok(cmd + ['ls']) diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py index 932d504d47f..19076ea44b3 100644 --- a/qa/tasks/cephfs/test_nfs.py +++ b/qa/tasks/cephfs/test_nfs.py @@ -8,6 +8,7 @@ from io import BytesIO, StringIO from tasks.mgr.mgr_test_case import MgrTestCase from teuthology import contextutil from teuthology.exceptions import CommandFailedError +from teuthology.orchestra.run import Raw log = logging.getLogger(__name__) @@ -319,7 +320,7 @@ class TestNFS(MgrTestCase): else: log.warning(f'{e}, retrying') - def _test_mnt(self, pseudo_path, port, ip, check=True): + def _test_mnt(self, pseudo_path, port, ip, check=True, datarw=False): ''' Test mounting of created exports :param pseudo_path: It is the pseudo root name @@ -347,12 +348,27 @@ 
class TestNFS(MgrTestCase): self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt']) try: + # Clean up volumes directory created by subvolume create by some tests + self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/volumes']) self.ctx.cluster.run(args=['touch', '/mnt/test']) out_mnt = self._sys_cmd(['ls', '/mnt']) self.assertEqual(out_mnt, b'test\n') + if datarw: + self.ctx.cluster.run(args=['echo', 'test data', Raw('|'), 'tee', '/mnt/test1']) + out_test1 = self._sys_cmd(['cat', '/mnt/test1']) + self.assertEqual(out_test1, b'test data\n') finally: self.ctx.cluster.run(args=['sudo', 'umount', '/mnt']) + def _test_data_read_write(self, pseudo_path, port, ip): + ''' + Check if read/write works fine + ''' + try: + self._test_mnt(pseudo_path, port, ip, True, True) + except CommandFailedError as e: + self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}") + def _write_to_read_only_export(self, pseudo_path, port, ip): ''' Check if write to read only export fails @@ -599,6 +615,18 @@ class TestNFS(MgrTestCase): self._write_to_read_only_export(self.pseudo_path, port, ip) self._test_delete_cluster() + def test_data_read_write(self): + ''' + Test date read and write on export. + ''' + self._test_create_cluster() + self._create_export(export_id='1', create_fs=True, + extra_cmd=['--pseudo-path', self.pseudo_path]) + port, ip = self._get_port_ip_info() + self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed') + self._test_data_read_write(self.pseudo_path, port, ip) + self._test_delete_cluster() + def test_cluster_info(self): ''' Test cluster info outputs correct ip and hostname diff --git a/qa/tasks/cephfs/test_quota.py b/qa/tasks/cephfs/test_quota.py index b5691c83852..ae1c1f2056c 100644 --- a/qa/tasks/cephfs/test_quota.py +++ b/qa/tasks/cephfs/test_quota.py @@ -115,9 +115,11 @@ class TestQuota(CephFSTestCase): readable_values = {"10K": "10240", "100Ki": "102400", + "100KiB": "102400", "10M": "10485760", "100Mi": "104857600", "2G": "2147483648", + "2GB": "2147483648", "4Gi": "4294967296", "1T": "1099511627776", "2Ti": "2199023255552"} @@ -135,7 +137,8 @@ class TestQuota(CephFSTestCase): self.mount_a.run_shell(["mkdir", "subdir"]) - invalid_values = ["10A", "1y00Ki", "af00", "G", "", " ", "-1t", "-1"] + invalid_values = ["10A", "1y00Ki", "af00", "G", "", " ", "-1t", "-1", + "1GT", "2MM", "5Di", "8Bi", "i", "7iB"] for invalid_value in invalid_values: with self.assertRaises(CommandFailedError): self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes", diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py index 2baefd72c3f..2ee3b6ac052 100644 --- a/qa/tasks/cephfs/test_volumes.py +++ b/qa/tasks/cephfs/test_volumes.py @@ -2388,7 +2388,7 @@ class TestSubvolumes(TestVolumesHelper): self._fs_cmd("subvolume", "create", self.volname, subvolume) # set earmark - earmark = "smb.test" + earmark = "smb" self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) # get earmark @@ -2401,7 +2401,7 @@ class TestSubvolumes(TestVolumesHelper): self._fs_cmd("subvolume", "create", self.volname, subvolume) # set earmark - earmark = "smb.test" + earmark = "smb" self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) # remove earmark @@ -2559,7 +2559,7 @@ class TestSubvolumes(TestVolumesHelper): self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature)) # set earmark - earmark = "smb.test" + earmark = "smb" 
self._fs_cmd("subvolume", "earmark", "set", self.volname, subvolume, "--earmark", earmark) subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) @@ -7876,7 +7876,22 @@ class TestCloneProgressReporter(TestVolumesHelper): self.run_ceph_cmd('fs subvolume snapshot rm --force ' f'--format json {v} {sv} {ss}') - self.run_ceph_cmd(f'fs subvolume rm {v} {sv}') + try: + self.run_ceph_cmd(f'fs subvolume rm {v} {sv}') + except CommandFailedError as e: + if e.exitstatus == errno.ENOENT: + log.info( + 'ignoring this error, perhaps subvolume was deleted ' + 'during the test and snapshot deleted above is a ' + 'retained snapshot. when a retained snapshot (which is ' + 'snapshot retained despite of subvolume deletion) is ' + 'deleted, the subvolume directory is also deleted ' + 'along. and before retained snapshot deletion, the ' + 'subvolume is reported by "subvolume ls" command, which' + 'is what probably caused confusion here') + pass + else: + raise # verify trash dir is clean self._wait_for_trash_empty() @@ -8090,6 +8105,58 @@ class TestCloneProgressReporter(TestVolumesHelper): # and not cancelling these clone doesnt affect this test case. self.cancel_clones_and_ignore_if_finished(c) + def test_clone_after_subvol_is_removed(self): + ''' + Initiate cloning after source subvolume has been deleted but with + snapshots retained and then test that, when this clone is in progress, + one progress bar is printed in output of command "ceph status" that + shows progress of this clone. + ''' + v = self.volname + sv = 'sv1' + ss = 'ss1' + # XXX: "clone" must be part of clone name for sake of tearDown() + c = 'ss1clone1' + + # XXX: without setting mds_snap_rstat to true rstats are not updated on + # a subvolume snapshot and therefore clone progress bar will not show + # any progress. + self.config_set('mds', 'mds_snap_rstat', 'true') + + self.run_ceph_cmd(f'fs subvolume create {v} {sv} --mode=777') + size = self._do_subvolume_io(sv, None, None, 10, 1024) + + self.run_ceph_cmd(f'fs subvolume snapshot create {v} {sv} {ss}') + self.wait_till_rbytes_is_right(v, sv, size) + + self.run_ceph_cmd(f'fs subvolume rm {v} {sv} --retain-snapshots') + self.run_ceph_cmd(f'fs subvolume snapshot clone {v} {sv} {ss} {c}') + + with safe_while(tries=15, sleep=10) as proceed: + while proceed(): + pev = self.get_pevs_from_ceph_status(c) + + if len(pev) < 1: + continue + elif len(pev) > 1: + raise RuntimeError('For 1 clone "ceph status" output has 2 ' + 'progress bars, it should have only 1 ' + f'progress bar.\npev -\n{pev}') + + # ensure that exactly 1 progress bar for cloning is present in + # "ceph status" output + msg = ('"progress_events" dict in "ceph status" output must have ' + f'exactly one entry.\nprogress_event dict -\n{pev}') + self.assertEqual(len(pev), 1, msg) + + pev_msg = tuple(pev.values())[0]['message'] + self.assertIn('1 ongoing clones', pev_msg) + break + + # allowing clone jobs to finish will consume too much time and space + # and not cancelling these clone doesnt affect this test case. 
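Reviewer note: the try/except added above tolerates ENOENT from `fs subvolume rm` because deleting the last retained snapshot of an already-removed subvolume also removes the subvolume directory, so the subvolume may legitimately be gone by cleanup time. A condensed sketch of that tolerant-cleanup pattern, reusing only names that already appear in these tests (CommandFailedError and a run_ceph_cmd-style helper):

    import errno
    from teuthology.exceptions import CommandFailedError

    def remove_subvolume_tolerating_enoent(run_ceph_cmd, volume, subvolume):
        # ENOENT here usually means the subvolume directory was already
        # removed together with its last retained snapshot; any other
        # failure is a real error and is re-raised.
        try:
            run_ceph_cmd(f'fs subvolume rm {volume} {subvolume}')
        except CommandFailedError as e:
            if e.exitstatus != errno.ENOENT:
                raise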
+ self.cancel_clones_and_ignore_if_finished(c) + def test_clones_equal_to_cloner_threads(self): ''' Test that one progress bar is printed in output of "ceph status" output diff --git a/qa/tasks/fwd_scrub.py b/qa/tasks/fwd_scrub.py index 2ac92439de6..d955d232c2c 100644 --- a/qa/tasks/fwd_scrub.py +++ b/qa/tasks/fwd_scrub.py @@ -33,6 +33,8 @@ class ForwardScrubber(ThrasherGreenlet): def _run(self): try: self.do_scrub() + except ThrasherGreenlet.Stopped: + pass except Exception as e: self.set_thrasher_exception(e) self.logger.exception("exception:") diff --git a/qa/tasks/mgr/dashboard/test_osd.py b/qa/tasks/mgr/dashboard/test_osd.py index 07c69ddc47c..be7afccf331 100644 --- a/qa/tasks/mgr/dashboard/test_osd.py +++ b/qa/tasks/mgr/dashboard/test_osd.py @@ -11,6 +11,7 @@ from .helper import (DashboardTestCase, JAny, JLeaf, JList, JObj, JTuple, class OsdTest(DashboardTestCase): AUTH_ROLES = ['cluster-manager'] + _VERSION = '1.1' @classmethod def setUpClass(cls): @@ -24,7 +25,7 @@ class OsdTest(DashboardTestCase): @DashboardTestCase.RunAs('test', 'test', ['block-manager']) def test_access_permissions(self): - self._get('/api/osd') + self._get('/api/osd', version=self._VERSION) self.assertStatus(403) self._get('/api/osd/0') self.assertStatus(403) @@ -33,7 +34,7 @@ class OsdTest(DashboardTestCase): self.assertSchema(data, JObj({p: JAny(none=False) for p in properties}, allow_unknown=True)) def test_list(self): - data = self._get('/api/osd') + data = self._get('/api/osd', version=self._VERSION) self.assertStatus(200) self.assertGreaterEqual(len(data), 1) diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py index b89f123c97e..42e357294d9 100644 --- a/qa/tasks/nvmeof.py +++ b/qa/tasks/nvmeof.py @@ -32,6 +32,7 @@ class Nvmeof(Task): gateway_config: namespaces_count: 10 cli_version: latest + create_mtls_secrets: False """ @@ -69,6 +70,7 @@ class Nvmeof(Task): self.serial = gateway_config.get('serial', 'SPDK00000000000001') self.port = gateway_config.get('port', '4420') self.srport = gateway_config.get('srport', '5500') + self.create_mtls_secrets = gateway_config.get('create_mtls_secrets', False) def deploy_nvmeof(self): """ @@ -147,7 +149,38 @@ class Nvmeof(Task): started=True, ) log.info("[nvmeof]: executed deploy_nvmeof successfully!") - + + def write_mtls_config(self, gateway_ips): + log.info("[nvmeof]: writing mtls config...") + allowed_ips = "" + for ip in gateway_ips: + allowed_ips += ("IP:" + ip + ",") + self.remote.run( + args=[ + "sudo", "openssl", "req", "-x509", "-newkey", "rsa:4096", "-nodes", "-keyout", "/etc/ceph/server.key", + "-out", "/etc/ceph/server.crt", "-days", "3650", "-subj", "/CN=my.server", "-addext", f"subjectAltName={allowed_ips[:-1]}" + ] + ) + self.remote.run( + args=[ + "sudo", "openssl", "req", "-x509", "-newkey", "rsa:4096", "-nodes", "-keyout", "/etc/ceph/client.key", + "-out", "/etc/ceph/client.crt", "-days", "3650", "-subj", "/CN=client1" + ] + ) + secrets_files = {"/etc/ceph/server.key": None, + "/etc/ceph/server.crt": None, + "/etc/ceph/client.key": None, + "/etc/ceph/client.crt": None, + } + for file in secrets_files.keys(): + secrets_files[file] = self.remote.read_file(path=file, sudo=True) + + for remote in self.ctx.cluster.remotes.keys(): + for remote_file in secrets_files.keys(): + data = secrets_files[remote_file] + remote.sudo_write_file(path=remote_file, data=data, mode='0644') + log.info("[nvmeof]: written mtls config!") + def set_gateway_cfg(self): log.info('[nvmeof]: running set_gateway_cfg...') ip_address = self.remote.ip_address @@ -174,6 +207,8 @@ class 
Nvmeof(Task): data=conf_data, sudo=True ) + if self.create_mtls_secrets: + self.write_mtls_config(gateway_ips) log.info("[nvmeof]: executed set_gateway_cfg successfully!") diff --git a/qa/tasks/tox.py b/qa/tasks/tox.py index 61c5b7411b4..4e4dee966d5 100644 --- a/qa/tasks/tox.py +++ b/qa/tasks/tox.py @@ -35,7 +35,7 @@ def task(ctx, config): ctx.cluster.only(client).run(args=[ 'source', '{tvdir}/bin/activate'.format(tvdir=tvdir), run.Raw('&&'), - 'pip', 'install', 'tox==3.15.0' + 'pip', 'install', 'tox' ]) # export the path Keystone and Tempest diff --git a/qa/workunits/client/test_oc_disabled.sh b/qa/workunits/client/test_oc_disabled.sh new file mode 100755 index 00000000000..88552aa50bd --- /dev/null +++ b/qa/workunits/client/test_oc_disabled.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +set -ex + +ceph_test_client --client_oc=false diff --git a/qa/workunits/nvmeof/mtls_test.sh b/qa/workunits/nvmeof/mtls_test.sh new file mode 100755 index 00000000000..e13ca530e8d --- /dev/null +++ b/qa/workunits/nvmeof/mtls_test.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -ex +source /etc/ceph/nvmeof.env + +# install yq +wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /tmp/yq && chmod +x /tmp/yq + +subjectAltName=$(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | sed 's/,/,IP:/g') + +# create mtls spec files +ceph orch ls nvmeof --export > /tmp/gw-conf-original.yaml +sudo /tmp/yq ".spec.enable_auth=true | \ + .spec.root_ca_cert=\"mountcert\" | \ + .spec.client_cert = load_str(\"/etc/ceph/client.crt\") | \ + .spec.client_key = load_str(\"/etc/ceph/client.key\") | \ + .spec.server_cert = load_str(\"/etc/ceph/server.crt\") | \ + .spec.server_key = load_str(\"/etc/ceph/server.key\")" /tmp/gw-conf-original.yaml > /tmp/gw-conf-with-mtls.yaml +cp /tmp/gw-conf-original.yaml /tmp/gw-conf-without-mtls.yaml +sudo /tmp/yq '.spec.enable_auth=false' -i /tmp/gw-conf-without-mtls.yaml + +wait_for_service() { + MAX_RETRIES=30 + for ((RETRY_COUNT=1; RETRY_COUNT<=MAX_RETRIES; RETRY_COUNT++)); do + + if ceph orch ls --refresh | grep -q "nvmeof"; then + echo "Found nvmeof in the output!" + break + fi + if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then + echo "Reached maximum retries ($MAX_RETRIES). Exiting." 
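Reviewer note: write_mtls_config() in qa/tasks/nvmeof.py (above) builds the certificate subjectAltName list from the gateway IPs before generating a self-signed server certificate, which mtls_test.sh later mounts into the CLI container. A condensed standalone sketch of the same openssl invocation, with the flags, paths, and CN value copied from the task code; run it on a host where openssl is available:

    import subprocess

    def make_server_cert(gateway_ips,
                         key='/etc/ceph/server.key', crt='/etc/ceph/server.crt'):
        # The server certificate must carry every gateway IP in subjectAltName,
        # otherwise TLS verification from the nvmeof CLI container fails.
        san = ','.join(f'IP:{ip}' for ip in gateway_ips)
        subprocess.run(
            ['sudo', 'openssl', 'req', '-x509', '-newkey', 'rsa:4096', '-nodes',
             '-keyout', key, '-out', crt, '-days', '3650',
             '-subj', '/CN=my.server', '-addext', f'subjectAltName={san}'],
            check=True)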
+ break + fi + sleep 5 + done + ceph orch ps + ceph orch ls --refresh +} + +# deploy mtls +cat /tmp/gw-conf-with-mtls.yaml +ceph orch apply -i /tmp/gw-conf-with-mtls.yaml +ceph orch redeploy nvmeof.mypool.mygroup0 +sleep 100 +wait_for_service + + +# test +IFS=',' read -ra gateway_ips <<< "$NVMEOF_GATEWAY_IP_ADDRESSES" +for i in "${!gateway_ips[@]}" +do + ip="${gateway_ips[i]}" + sudo podman run -v /etc/ceph/server.crt:/server.crt:z -v /etc/ceph/client.crt:/client.crt:z \ + -v /etc/ceph/client.key:/client.key:z \ + -it $NVMEOF_CLI_IMAGE --server-address $ip --server-port $NVMEOF_SRPORT \ + --client-key /client.key --client-cert /client.crt --server-cert /server.crt --format json subsystem list +done + + +# remove mtls +cat /tmp/gw-conf-without-mtls.yaml +ceph orch apply -i /tmp/gw-conf-without-mtls.yaml +ceph orch redeploy nvmeof.mypool.mygroup0 +sleep 100 +wait_for_service + + +# test +IFS=',' read -ra gateway_ips <<< "$NVMEOF_GATEWAY_IP_ADDRESSES" +for i in "${!gateway_ips[@]}" +do + ip="${gateway_ips[i]}" + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $ip --server-port $NVMEOF_SRPORT \ + --format json subsystem list +done + diff --git a/qa/workunits/nvmeof/setup_subsystem.sh b/qa/workunits/nvmeof/setup_subsystem.sh index fb72e1d6402..cc4024323eb 100755 --- a/qa/workunits/nvmeof/setup_subsystem.sh +++ b/qa/workunits/nvmeof/setup_subsystem.sh @@ -29,7 +29,7 @@ list_subsystems () { # add all subsystems for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}" - sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn + sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT subsystem add --subsystem $subsystem_nqn --no-group-append done list_subsystems diff --git a/qa/workunits/rbd/cli_migration.sh b/qa/workunits/rbd/cli_migration.sh index b044e747cbb..3af19420957 100755 --- a/qa/workunits/rbd/cli_migration.sh +++ b/qa/workunits/rbd/cli_migration.sh @@ -5,12 +5,16 @@ TEMPDIR= IMAGE1=image1 IMAGE2=image2 IMAGE3=image3 -IMAGES="${IMAGE1} ${IMAGE2} ${IMAGE3}" +NAMESPACE1=namespace1 +NAMESPACE2=namespace2 +NAMESPACES="${NAMESPACE1} ${NAMESPACE2}" +IMAGES="${IMAGE1} ${IMAGE2} ${IMAGE3} rbd/${NAMESPACE1}/${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2}" cleanup() { kill_nbd_server cleanup_tempdir remove_images + remove_namespaces } setup_tempdir() { @@ -42,8 +46,11 @@ create_base_image() { export_raw_image() { local image=$1 - rm -rf "${TEMPDIR}/${image}" - rbd export ${image} "${TEMPDIR}/${image}" + # Replace slashes (/) with underscores (_) for namespace images + local export_image="${image//\//_}" + + rm -rf "${TEMPDIR}/${export_image}" + rbd export "${image}" "${TEMPDIR}/${export_image}" } export_base_image() { @@ -69,6 +76,13 @@ remove_images() { done } +remove_namespaces() { + for namespace in ${NAMESPACES} + do + rbd namespace remove rbd/${namespace} || true + done +} + kill_nbd_server() { pkill -9 qemu-nbd || true } @@ -90,6 +104,11 @@ compare_images() { local ret=0 export_raw_image ${dst_image} + + # Replace slashes (/) with underscores (_) for namespace images + src_image="${src_image//\//_}" + dst_image="${dst_image//\//_}" + if ! 
cmp "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}" then show_diff "${TEMPDIR}/${src_image}" "${TEMPDIR}/${dst_image}" @@ -99,18 +118,26 @@ compare_images() { } test_import_native_format() { - local base_image=$1 - local dest_image=$2 + local base_image_spec=$1 + local dest_image_spec=$2 + + # if base image is from namespace + local base_namespace="" + local base_image=${base_image_spec} + if [[ "${base_image_spec}" == rbd/*/* ]]; then + base_namespace=$(basename "$(dirname "${base_image_spec}")") + base_image=$(basename "${base_image_spec}") + fi - rbd migration prepare --import-only "rbd/${base_image}@2" ${dest_image} - rbd migration abort ${dest_image} + rbd migration prepare --import-only "${base_image_spec}@2" ${dest_image_spec} + rbd migration abort ${dest_image_spec} local pool_id=$(ceph osd pool ls detail --format xml | xmlstarlet sel -t -v "//pools/pool[pool_name='rbd']/pool_id") cat > ${TEMPDIR}/spec.json <<EOF { "type": "native", "pool_id": ${pool_id}, - "pool_namespace": "", + "pool_namespace": "${base_namespace}", "image_name": "${base_image}", "snap_name": "2" } @@ -118,85 +145,85 @@ EOF cat ${TEMPDIR}/spec.json rbd migration prepare --import-only \ - --source-spec-path ${TEMPDIR}/spec.json ${dest_image} + --source-spec-path ${TEMPDIR}/spec.json ${dest_image_spec} - compare_images "${base_image}@1" "${dest_image}@1" - compare_images "${base_image}@2" "${dest_image}@2" + compare_images "${base_image_spec}@1" "${dest_image_spec}@1" + compare_images "${base_image_spec}@2" "${dest_image_spec}@2" - rbd migration abort ${dest_image} + rbd migration abort ${dest_image_spec} rbd migration prepare --import-only \ - --source-spec-path ${TEMPDIR}/spec.json ${dest_image} - rbd migration execute ${dest_image} + --source-spec-path ${TEMPDIR}/spec.json ${dest_image_spec} + rbd migration execute ${dest_image_spec} - compare_images "${base_image}@1" "${dest_image}@1" - compare_images "${base_image}@2" "${dest_image}@2" + compare_images "${base_image_spec}@1" "${dest_image_spec}@1" + compare_images "${base_image_spec}@2" "${dest_image_spec}@2" - rbd migration abort ${dest_image} + rbd migration abort ${dest_image_spec} # no snap name or snap id expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\"}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\"}" \ + ${dest_image_spec} # invalid source spec JSON expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_name\": non-existing}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": non-existing}" \ + ${dest_image_spec} # non-existing snap name expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_name\": \"non-existing\"}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"non-existing\"}" \ + ${dest_image_spec} # invalid snap name expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", 
\"snap_name\": 123456}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": 123456}" \ + ${dest_image_spec} # non-existing snap id passed as int expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": 123456}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": 123456}" \ + ${dest_image_spec} # non-existing snap id passed as string expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": \"123456\"}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"123456\"}" \ + ${dest_image_spec} # invalid snap id expect_false rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": \"foobar\"}" \ - ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"foobar\"}" \ + ${dest_image_spec} # snap id passed as int - local snap_id=$(rbd snap ls ${base_image} --format xml | xmlstarlet sel -t -v "//snapshots/snapshot[name='2']/id") + local snap_id=$(rbd snap ls ${base_image_spec} --format xml | xmlstarlet sel -t -v "//snapshots/snapshot[name='2']/id") rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": ${snap_id}}" \ - ${dest_image} - rbd migration abort ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": ${snap_id}}" \ + ${dest_image_spec} + rbd migration abort ${dest_image_spec} # snap id passed as string rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_id\": \"${snap_id}\"}" \ - ${dest_image} - rbd migration abort ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_id\": \"${snap_id}\"}" \ + ${dest_image_spec} + rbd migration abort ${dest_image_spec} rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ - ${dest_image} - rbd migration abort ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_id\": ${pool_id}, \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ + ${dest_image_spec} + rbd migration abort ${dest_image_spec} rbd migration prepare --import-only \ - --source-spec "{\"type\": \"native\", \"pool_name\": \"rbd\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ - ${dest_image} - rbd migration execute ${dest_image} - rbd migration commit ${dest_image} + --source-spec "{\"type\": \"native\", \"pool_name\": \"rbd\", \"pool_namespace\": \"${base_namespace}\", \"image_name\": \"${base_image}\", \"snap_name\": \"2\"}" \ + 
${dest_image_spec} + rbd migration execute ${dest_image_spec} + rbd migration commit ${dest_image_spec} - compare_images "${base_image}@1" "${dest_image}@1" - compare_images "${base_image}@2" "${dest_image}@2" + compare_images "${base_image_spec}@1" "${dest_image_spec}@1" + compare_images "${base_image_spec}@2" "${dest_image_spec}@2" - remove_image "${dest_image}" + remove_image "${dest_image_spec}" } test_import_qcow_format() { @@ -337,12 +364,12 @@ EOF cat ${TEMPDIR}/spec.json cat ${TEMPDIR}/spec.json | rbd migration prepare --import-only \ - --source-spec-path - ${dest_image} + --source-spec-path - ${dest_image} compare_images ${base_image} ${dest_image} rbd migration abort ${dest_image} rbd migration prepare --import-only \ - --source-spec-path ${TEMPDIR}/spec.json ${dest_image} + --source-spec-path ${TEMPDIR}/spec.json ${dest_image} rbd migration execute ${dest_image} rbd migration commit ${dest_image} @@ -587,4 +614,18 @@ test_import_nbd_stream_qcow2 ${IMAGE2} ${IMAGE3} test_import_raw_format ${IMAGE1} ${IMAGE2} test_import_nbd_stream_raw ${IMAGE1} ${IMAGE2} +rbd namespace create rbd/${NAMESPACE1} +rbd namespace create rbd/${NAMESPACE2} +create_base_image rbd/${NAMESPACE1}/${IMAGE1} +export_base_image rbd/${NAMESPACE1}/${IMAGE1} + +# Migration from namespace to namespace +test_import_native_format rbd/${NAMESPACE1}/${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2} + +# Migration from namespace to non-namespace +test_import_native_format rbd/${NAMESPACE1}/${IMAGE1} ${IMAGE2} + +# Migration from non-namespace to namespace +test_import_native_format ${IMAGE1} rbd/${NAMESPACE2}/${IMAGE2} + echo OK diff --git a/qa/workunits/rbd/luks-encryption.sh b/qa/workunits/rbd/luks-encryption.sh index 97cb5a0fe87..b6305cb46c6 100755 --- a/qa/workunits/rbd/luks-encryption.sh +++ b/qa/workunits/rbd/luks-encryption.sh @@ -2,7 +2,7 @@ set -ex CEPH_ID=${CEPH_ID:-admin} -TMP_FILES="/tmp/passphrase /tmp/passphrase2 /tmp/testdata1 /tmp/testdata2 /tmp/cmpdata /tmp/rawexport /tmp/export.qcow2" +TMP_FILES="/tmp/passphrase /tmp/passphrase1 /tmp/passphrase2 /tmp/testdata1 /tmp/testdata2 /tmp/cmpdata /tmp/rawexport /tmp/export.qcow2" _sudo() { @@ -278,8 +278,7 @@ function test_migration_clone() { rbd migration prepare testimg1 testimg2 # test reading - # FIXME: https://tracker.ceph.com/issues/63184 - LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase) + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase) cmp $LIBRBD_DEV /tmp/cmpdata # trigger copyup for an unwritten area @@ -297,8 +296,7 @@ function test_migration_clone() { _sudo rbd device unmap -t nbd $LIBRBD_DEV # test reading on a fresh mapping - # FIXME: https://tracker.ceph.com/issues/63184 - LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase) + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase) cmp $LIBRBD_DEV /tmp/cmpdata _sudo rbd device unmap -t nbd $LIBRBD_DEV @@ -320,6 +318,85 @@ function test_migration_clone() { rbd rm testimg } +function test_migration_open_clone_chain() { + rbd create --size 32M testimg + rbd encryption format testimg luks1 /tmp/passphrase + rbd snap create testimg@snap + rbd snap protect testimg@snap + + 
rbd clone testimg@snap testimg1 + rbd encryption format testimg1 luks2 /tmp/passphrase1 + rbd snap create testimg1@snap + rbd snap protect testimg1@snap + + rbd clone testimg1@snap testimg2 + rbd encryption format testimg2 luks1 /tmp/passphrase2 + + # 1. X <-- X <-- X + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + + # 2. X <-- X <-- migrating + rbd migration prepare testimg2 testimg2 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg2 + + # 3. X <-- migrating <-- X + rbd migration prepare testimg1 testimg1 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg1 + + # 4. migrating <-- X <-- X + rbd migration prepare testimg testimg + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg + + # 5. migrating <-- migrating <-- X + rbd migration prepare testimg testimg + rbd migration prepare testimg1 testimg1 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg1 + rbd migration abort testimg + + # 6. migrating <-- X <-- migrating + rbd migration prepare testimg testimg + rbd migration prepare testimg2 testimg2 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg2 + rbd migration abort testimg + + # 7. X <-- migrating <-- migrating + rbd migration prepare testimg1 testimg1 + rbd migration prepare testimg2 testimg2 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + rbd migration abort testimg2 + rbd migration abort testimg1 + + # 8. 
migrating <-- migrating <-- migrating + rbd migration prepare testimg testimg + rbd migration prepare testimg1 testimg1 + rbd migration prepare testimg2 testimg2 + LIBRBD_DEV=$(_sudo rbd -p rbd map testimg2 -t nbd -o encryption-passphrase-file=/tmp/passphrase2,encryption-passphrase-file=/tmp/passphrase1,encryption-passphrase-file=/tmp/passphrase) + _sudo rbd device unmap -t nbd $LIBRBD_DEV + + rbd migration abort testimg2 + rbd rm testimg2 + rbd migration abort testimg1 + rbd snap unprotect testimg1@snap + rbd snap rm testimg1@snap + rbd rm testimg1 + rbd migration abort testimg + rbd snap unprotect testimg@snap + rbd snap rm testimg@snap + rbd rm testimg +} + function get_nbd_device_paths { rbd device list -t nbd | tail -n +2 | egrep "\s+rbd\s+testimg" | awk '{print $5;}' } @@ -343,6 +420,7 @@ function clean_up { rbd snap unprotect testimg1@snap || true rbd snap remove testimg1@snap || true rbd remove testimg1 || true + rbd migration abort testimg || true rbd snap remove testimg@snap2 || true rbd snap remove testimg@snap1 || true rbd snap unprotect testimg@snap || true @@ -371,6 +449,7 @@ dd if=/dev/urandom of=/tmp/testdata2 bs=4M count=4 # create passphrase files printf "pass\0word\n" > /tmp/passphrase +printf " passwo\nrd 1,1" > /tmp/passphrase1 printf "\t password2 " > /tmp/passphrase2 # create an image @@ -401,4 +480,6 @@ test_migration_clone luks1 rbd create --size 48M testimg test_migration_clone luks2 +test_migration_open_clone_chain + echo OK diff --git a/qa/workunits/rgw/s3_utilities.pm b/qa/workunits/rgw/s3_utilities.pm index 3c3fae900e8..5a91db9d1fd 100644 --- a/qa/workunits/rgw/s3_utilities.pm +++ b/qa/workunits/rgw/s3_utilities.pm @@ -21,7 +21,7 @@ sub get_timestamp { if ($min < 10) { $min = "0$min"; } if ($sec < 10) { $sec = "0$sec"; } $year=$year+1900; - return $year . '_' . $mon . '_' . $mday . '_' . $hour . '_' . $min . '_' . $sec; + return $year . '-' . $mon . '-' . $mday . '-' . $hour . '-' . $min . '-' . $sec; } # Function to check if radosgw is already running @@ -195,11 +195,12 @@ sub run_s3 host => $hostname, secure => 0, retry => 1, + dns_bucket_names => 0, } ); } -our $bucketname = 'buck_'.get_timestamp(); +our $bucketname = 'buck-'.get_timestamp(); # create a new bucket (the test bucket) our $bucket = $s3->add_bucket( { bucket => $bucketname } ) or die $s3->err. "bucket $bucketname create failed\n". 
$s3->errstr; diff --git a/qa/workunits/rgw/test_rgw_bucket_check.py b/qa/workunits/rgw/test_rgw_bucket_check.py index bfa6d65d6e7..33936df2401 100755 --- a/qa/workunits/rgw/test_rgw_bucket_check.py +++ b/qa/workunits/rgw/test_rgw_bucket_check.py @@ -173,6 +173,7 @@ def main(): exec_cmd(f'radosgw-admin bucket check --fix --bucket {BUCKET_NAME}') out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {BUCKET_NAME} --fix --min-age-hours 0 --rgw-olh-pending-timeout-sec 0 --dump-keys') json_out = json.loads(out) + log.info(f'"bucket check unlinked" returned {json_out}, expecting {unlinked_keys}') assert len(json_out) == len(unlinked_keys) bucket.object_versions.all().delete() out = exec_cmd(f'radosgw-admin bucket stats --bucket {BUCKET_NAME}') diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc index 1985c85435e..72921e6d9f0 100644 --- a/src/blk/kernel/KernelDevice.cc +++ b/src/blk/kernel/KernelDevice.cc @@ -65,7 +65,6 @@ KernelDevice::KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, ai discard_callback(d_cb), discard_callback_priv(d_cbpriv), aio_stop(false), - discard_stop(false), aio_thread(this), injecting_crash(0) { @@ -548,7 +547,7 @@ void KernelDevice::_aio_stop() } } -void KernelDevice::_discard_update_threads() +void KernelDevice::_discard_update_threads(bool discard_stop) { std::unique_lock l(discard_lock); @@ -570,28 +569,27 @@ void KernelDevice::_discard_update_threads() } // Decrease? Signal threads after telling them to stop } else if (newcount < oldcount) { + std::vector<std::shared_ptr<DiscardThread>> discard_threads_to_stop; dout(10) << __func__ << " stopping " << (oldcount - newcount) << " existing discard threads" << dendl; // Signal the last threads to quit, and stop tracking them - for(uint64_t i = oldcount; i > newcount; i--) - { + for(uint64_t i = oldcount; i > newcount; i--) { discard_threads[i-1]->stop = true; - discard_threads[i-1]->detach(); + discard_threads_to_stop.push_back(discard_threads[i-1]); } - discard_threads.resize(newcount); - discard_cond.notify_all(); + discard_threads.resize(newcount); + l.unlock(); + for (auto &t : discard_threads_to_stop) { + t->join(); + } } } void KernelDevice::_discard_stop() { dout(10) << __func__ << dendl; - - discard_stop = true; - _discard_update_threads(); - discard_drain(); - + _discard_update_threads(true); dout(10) << __func__ << " stopped" << dendl; } diff --git a/src/blk/kernel/KernelDevice.h b/src/blk/kernel/KernelDevice.h index 42e542a6cc8..ac555cdd3da 100644 --- a/src/blk/kernel/KernelDevice.h +++ b/src/blk/kernel/KernelDevice.h @@ -58,7 +58,6 @@ private: aio_callback_t discard_callback; void *discard_callback_priv; bool aio_stop; - bool discard_stop; std::unique_ptr<PerfCounters> logger; ceph::mutex discard_lock = ceph::make_mutex("KernelDevice::discard_lock"); @@ -100,7 +99,7 @@ private: int _aio_start(); void _aio_stop(); - void _discard_update_threads(); + void _discard_update_threads(bool discard_stop = false); void _discard_stop(); bool _discard_started(); diff --git a/src/ceph-volume/ceph_volume/__init__.py b/src/ceph-volume/ceph_volume/__init__.py index b10100c0218..814619cfddd 100644 --- a/src/ceph-volume/ceph_volume/__init__.py +++ b/src/ceph-volume/ceph_volume/__init__.py @@ -6,6 +6,7 @@ from collections import namedtuple sys_info = namedtuple('sys_info', ['devices']) sys_info.devices = dict() logger = logging.getLogger(__name__) +BEING_REPLACED_HEADER: str = 'CEPH_DEVICE_BEING_REPLACED' class AllowLoopDevices: diff --git a/src/ceph-volume/ceph_volume/api/lvm.py 
b/src/ceph-volume/ceph_volume/api/lvm.py index dcc4f186272..fc376f891fd 100644 --- a/src/ceph-volume/ceph_volume/api/lvm.py +++ b/src/ceph-volume/ceph_volume/api/lvm.py @@ -6,11 +6,12 @@ set of utilities for interacting with LVM. import logging import os import uuid -import re from itertools import repeat from math import floor from ceph_volume import process, util, conf from ceph_volume.exceptions import SizeAllocationError +from typing import Any, Dict + logger = logging.getLogger(__name__) @@ -808,13 +809,16 @@ LV_CMD_OPTIONS = ['--noheadings', '--readonly', '--separator=";"', '-a', '--units=b', '--nosuffix'] -class Volume(object): +class Volume: """ Represents a Logical Volume from LVM, with some top-level attributes like ``lv_name`` and parsed tags as a dictionary of key/value pairs. """ - def __init__(self, **kw): + def __init__(self, **kw: str) -> None: + self.lv_path: str = '' + self.lv_name: str = '' + self.lv_uuid: str = '' for k, v in kw.items(): setattr(self, k, v) self.lv_api = kw @@ -825,13 +829,13 @@ class Volume(object): self.encrypted = self.tags.get('ceph.encrypted', '0') == '1' self.used_by_ceph = 'ceph.osd_id' in self.tags - def __str__(self): + def __str__(self) -> str: return '<%s>' % self.lv_api['lv_path'] - def __repr__(self): + def __repr__(self) -> str: return self.__str__() - def as_dict(self): + def as_dict(self) -> Dict[str, Any]: obj = {} obj.update(self.lv_api) obj['tags'] = self.tags @@ -840,7 +844,7 @@ class Volume(object): obj['path'] = self.lv_path return obj - def report(self): + def report(self) -> Dict[str, Any]: if not self.used_by_ceph: return { 'name': self.lv_name, @@ -1210,39 +1214,3 @@ def get_lv_by_fullname(full_name): except ValueError: res_lv = None return res_lv - -def get_lv_path_from_mapper(mapper): - """ - This functions translates a given mapper device under the format: - /dev/mapper/LV to the format /dev/VG/LV. - eg: - from: - /dev/mapper/ceph--c1a97e46--234c--46aa--a549--3ca1d1f356a9-osd--block--32e8e896--172e--4a38--a06a--3702598510ec - to: - /dev/ceph-c1a97e46-234c-46aa-a549-3ca1d1f356a9/osd-block-32e8e896-172e-4a38-a06a-3702598510ec - """ - results = re.split(r'^\/dev\/mapper\/(.+\w)-(\w.+)', mapper) - results = list(filter(None, results)) - - if len(results) != 2: - return None - - return f"/dev/{results[0].replace('--', '-')}/{results[1].replace('--', '-')}" - -def get_mapper_from_lv_path(lv_path): - """ - This functions translates a given lv path under the format: - /dev/VG/LV to the format /dev/mapper/LV. 
- eg: - from: - /dev/ceph-c1a97e46-234c-46aa-a549-3ca1d1f356a9/osd-block-32e8e896-172e-4a38-a06a-3702598510ec - to: - /dev/mapper/ceph--c1a97e46--234c--46aa--a549--3ca1d1f356a9-osd--block--32e8e896--172e--4a38--a06a--3702598510ec - """ - results = re.split(r'^\/dev\/(.+\w)-(\w.+)', lv_path) - results = list(filter(None, results)) - - if len(results) != 2: - return None - - return f"/dev/mapper/{results[0].replace('-', '--')}/{results[1].replace('-', '--')}" diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py index c1bef82c109..c278de43eb0 100644 --- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py +++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -5,12 +5,13 @@ import time from textwrap import dedent -from ceph_volume import decorators, terminal, process +from ceph_volume import decorators, terminal, process, BEING_REPLACED_HEADER from ceph_volume.api import lvm as api from ceph_volume.util import system, encryption, disk, arg_validators, str_to_int, merge_dict from ceph_volume.util.device import Device from ceph_volume.systemd import systemctl -from typing import List +from ceph_volume.devices.raw.list import direct_report +from typing import Any, Dict, List, Set logger = logging.getLogger(__name__) mlogger = terminal.MultiLogger(__name__) @@ -96,84 +97,127 @@ def zap_data(path): ]) -def find_associated_devices(osd_id=None, osd_fsid=None): - """ - From an ``osd_id`` and/or an ``osd_fsid``, filter out all the LVs in the - system that match those tag values, further detect if any partitions are - part of the OSD, and then return the set of LVs and partitions (if any). - """ - lv_tags = {} - if osd_id: - lv_tags['ceph.osd_id'] = osd_id - if osd_fsid: - lv_tags['ceph.osd_fsid'] = osd_fsid - - lvs = api.get_lvs(tags=lv_tags) - if not lvs: - raise RuntimeError('Unable to find any LV for zapping OSD: ' - '%s' % osd_id or osd_fsid) +class Zap: + help = 'Removes all data and filesystems from a logical volume or partition.' - devices_to_zap = ensure_associated_lvs(lvs, lv_tags) - return [Device(path) for path in set(devices_to_zap) if path] + def __init__(self, argv: List[str]) -> None: + self.argv = argv + self.osd_ids_to_zap: List[str] = [] + def ensure_associated_raw(self, raw_report: Dict[str, Any]) -> List[str]: + osd_id: str = self.args.osd_id + osd_uuid: str = self.args.osd_fsid + raw_devices: Set[str] = set() -def ensure_associated_lvs(lvs, lv_tags={}): - """ - Go through each LV and ensure if backing devices (journal, wal, block) - are LVs or partitions, so that they can be accurately reported. - """ - # look for many LVs for each backing type, because it is possible to - # receive a filtering for osd.1, and have multiple failed deployments - # leaving many journals with osd.1 - usually, only a single LV will be - # returned - - db_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'db'})) - wal_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'wal'})) - backing_devices = [(db_lvs, 'db'), - (wal_lvs, 'wal')] - - verified_devices = [] - - for lv in lvs: - # go through each lv and append it, otherwise query `blkid` to find - # a physical device. 
Do this for each type (journal,db,wal) regardless - # if they have been processed in the previous LV, so that bad devices - # with the same ID can be caught - for ceph_lvs, _type in backing_devices: - if ceph_lvs: - verified_devices.extend([l.lv_path for l in ceph_lvs]) - continue - - # must be a disk partition, by querying blkid by the uuid we are - # ensuring that the device path is always correct - try: - device_uuid = lv.tags['ceph.%s_uuid' % _type] - except KeyError: - # Bluestore will not have ceph.journal_uuid, and Filestore - # will not not have ceph.db_uuid - continue + if len([details.get('osd_id') for _, details in raw_report.items() if details.get('osd_id') == osd_id]) > 1: + if not osd_uuid: + raise RuntimeError(f'Multiple OSDs found with id {osd_id}, pass --osd-fsid') - osd_device = disk.get_device_from_partuuid(device_uuid) - if not osd_device: - # if the osd_device is not found by the partuuid, then it is - # not possible to ensure this device exists anymore, so skip it - continue - verified_devices.append(osd_device) + if not osd_uuid: + for _, details in raw_report.items(): + if details.get('osd_id') == int(osd_id): + osd_uuid = details.get('osd_uuid') + break - verified_devices.append(lv.lv_path) + for osd_uuid, details in raw_report.items(): + device: str = details.get('device') + if details.get('osd_uuid') == osd_uuid: + raw_devices.add(device) - # reduce the list from all the duplicates that were added - return list(set(verified_devices)) + return list(raw_devices) + + def find_associated_devices(self) -> List[api.Volume]: + """From an ``osd_id`` and/or an ``osd_fsid``, filter out all the Logical Volumes (LVs) in the + system that match those tag values, further detect if any partitions are + part of the OSD, and then return the set of LVs and partitions (if any). -class Zap(object): + The function first queries the LVM-based OSDs using the provided `osd_id` or `osd_fsid`. + If no matches are found, it then searches the system for RAW-based OSDs. - help = 'Removes all data and filesystems from a logical volume or partition.' + Raises: + SystemExit: If no OSDs are found, the function raises a `SystemExit` with an appropriate message. - def __init__(self, argv): - self.argv = argv + Returns: + List[api.Volume]: A list of `api.Volume` objects corresponding to the OSD's Logical Volumes (LVs) + or partitions that are associated with the given `osd_id` or `osd_fsid`. - def unmount_lv(self, lv): + Notes: + - If neither `osd_id` nor `osd_fsid` are provided, the function will not be able to find OSDs. + - The search proceeds from LVM-based OSDs to RAW-based OSDs if no Logical Volumes are found. 
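The docstring above describes an LVM-first lookup that falls back to scanning RAW-based OSDs and bails out when neither source matches. A condensed, standalone sketch of that control flow, assuming hypothetical stand-in callables (`get_lvs_by_tags`, `scan_raw_osds`) rather than the real `api.get_lvs` / `direct_report` helpers:

    from typing import Any, Callable, Dict, List, Optional


    def find_osd_devices(osd_id: Optional[str],
                         osd_fsid: Optional[str],
                         get_lvs_by_tags: Callable[[Dict[str, str]], List[Dict[str, Any]]],
                         scan_raw_osds: Callable[[], Dict[str, Dict[str, Any]]]) -> List[str]:
        """LVM-first lookup with a RAW fallback, mirroring the flow described above."""
        # Build the tag filter only from the identifiers that were actually given.
        lv_tags = {key: value for key, value in {
            'ceph.osd_id': osd_id,
            'ceph.osd_fsid': osd_fsid,
        }.items() if value}

        # 1) Try LVM-based OSDs first.
        lvs = get_lvs_by_tags(lv_tags)
        if lvs:
            return sorted({lv['lv_path'] for lv in lvs})

        # 2) Fall back to RAW-based OSDs (report keyed by osd_uuid).
        def matches(details: Dict[str, Any]) -> bool:
            # str() papers over the int-vs-str osd_id mismatch seen in raw reports.
            if osd_id is not None and str(details.get('osd_id')) == str(osd_id):
                return True
            return osd_fsid is not None and details.get('osd_uuid') == osd_fsid

        devices = [d['device'] for d in scan_raw_osds().values() if matches(d)]
        if not devices:
            raise SystemExit('No OSD was found.')
        return sorted(set(devices))


    # Example: no LVs match, but the raw report does.
    print(find_osd_devices('0', None,
                           get_lvs_by_tags=lambda tags: [],
                           scan_raw_osds=lambda: {'uuid-1': {'osd_id': 0, 'device': '/dev/sdb'}}))
    # -> ['/dev/sdb']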
+ """ + lv_tags = {} + lv_tags = {key: value for key, value in { + 'ceph.osd_id': self.args.osd_id, + 'ceph.osd_fsid': self.args.osd_fsid + }.items() if value} + devices_to_zap: List[str] = [] + lvs = api.get_lvs(tags=lv_tags) + + if lvs: + devices_to_zap = self.ensure_associated_lvs(lvs, lv_tags) + else: + mlogger.debug(f'No OSD identified by "{self.args.osd_id or self.args.osd_fsid}" was found among LVM-based OSDs.') + mlogger.debug('Proceeding to check RAW-based OSDs.') + raw_osds: Dict[str, Any] = direct_report() + if raw_osds: + devices_to_zap = self.ensure_associated_raw(raw_osds) + if not devices_to_zap: + raise SystemExit('No OSD were found.') + + return [Device(path) for path in set(devices_to_zap) if path] + + def ensure_associated_lvs(self, + lvs: List[api.Volume], + lv_tags: Dict[str, Any] = {}) -> List[str]: + """ + Go through each LV and ensure if backing devices (journal, wal, block) + are LVs or partitions, so that they can be accurately reported. + """ + # look for many LVs for each backing type, because it is possible to + # receive a filtering for osd.1, and have multiple failed deployments + # leaving many journals with osd.1 - usually, only a single LV will be + # returned + + db_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'db'})) + wal_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'wal'})) + backing_devices = [(db_lvs, 'db'), + (wal_lvs, 'wal')] + + verified_devices = [] + + for lv in lvs: + # go through each lv and append it, otherwise query `blkid` to find + # a physical device. Do this for each type (journal,db,wal) regardless + # if they have been processed in the previous LV, so that bad devices + # with the same ID can be caught + for ceph_lvs, _type in backing_devices: + if ceph_lvs: + verified_devices.extend([l.lv_path for l in ceph_lvs]) + continue + + # must be a disk partition, by querying blkid by the uuid we are + # ensuring that the device path is always correct + try: + device_uuid = lv.tags['ceph.%s_uuid' % _type] + except KeyError: + # Bluestore will not have ceph.journal_uuid, and Filestore + # will not not have ceph.db_uuid + continue + + osd_device = disk.get_device_from_partuuid(device_uuid) + if not osd_device: + # if the osd_device is not found by the partuuid, then it is + # not possible to ensure this device exists anymore, so skip it + continue + verified_devices.append(osd_device) + + verified_devices.append(lv.lv_path) + + # reduce the list from all the duplicates that were added + return list(set(verified_devices)) + + def unmount_lv(self, lv: api.Volume) -> None: if lv.tags.get('ceph.cluster_name') and lv.tags.get('ceph.osd_id'): lv_path = "/var/lib/ceph/osd/{}-{}".format(lv.tags['ceph.cluster_name'], lv.tags['ceph.osd_id']) else: @@ -186,40 +230,95 @@ class Zap(object): if dmcrypt and dmcrypt_uuid: self.dmcrypt_close(dmcrypt_uuid) - def zap_lv(self, device): + def _write_replacement_header(self, device: str) -> None: + """Write a replacement header to a device. + + This method writes the string defined in `BEING_REPLACED_HEADER` + to the specified device. This header indicates that the device + is in the process of being replaced. + + Args: + device (str): The path to the device on which the replacement + header will be written. + """ + disk._dd_write(device, + BEING_REPLACED_HEADER) + + def clear_replace_header(self) -> bool: + """Safely erase the replacement header on a device if it is marked as being replaced. 
+ + This method checks whether the given device is marked as being replaced + (`device.is_being_replaced`). If true, it proceeds to erase the replacement header + from the device using the `_erase_replacement_header` method. The method returns + a boolean indicating whether any action was taken. + + Args: + device (Device): The device object, which includes information about the device's + path and status (such as whether it is currently being replaced). + + Returns: + bool: True if the replacement header was successfully erased, False if the + device was not marked as being replaced or no action was necessary. + """ + result: bool = False + device: Device = self.args.clear_replace_header + if device.is_being_replaced: + self._erase_replacement_header(device.path) + result = True + return result + + def _erase_replacement_header(self, device: str) -> None: + """Erase the replacement header on a device. + + This method writes a sequence of null bytes (`0x00`) over the area of the device + where the replacement header is stored, effectively erasing it. + + Args: + device (str): The path to the device from which the replacement header will be erased. + """ + disk._dd_write(device, + b'\x00' * len(BEING_REPLACED_HEADER)) + + def zap_lv(self, device: Device) -> None: """ Device examples: vg-name/lv-name, /dev/vg-name/lv-name Requirements: Must be a logical volume (LV) """ - lv = api.get_single_lv(filters={'lv_name': device.lv_name, 'vg_name': - device.vg_name}) + lv: api.Volume = device.lv_api self.unmount_lv(lv) - + self.parent_device: str = disk.get_parent_device_from_mapper(lv.lv_path) zap_device(device.path) if self.args.destroy: lvs = api.get_lvs(filters={'vg_name': device.vg_name}) - if lvs == []: - mlogger.info('No LVs left, exiting', device.vg_name) - return - elif len(lvs) <= 1: + if len(lvs) <= 1: mlogger.info('Only 1 LV left in VG, will proceed to destroy ' 'volume group %s', device.vg_name) pvs = api.get_pvs(filters={'lv_uuid': lv.lv_uuid}) api.remove_vg(device.vg_name) for pv in pvs: api.remove_pv(pv.pv_name) + replacement_args: Dict[str, bool] = { + 'block': self.args.replace_block, + 'db': self.args.replace_db, + 'wal': self.args.replace_wal + } + if replacement_args.get(lv.tags.get('ceph.type'), False): + mlogger.info(f'Marking {self.parent_device} as being replaced') + self._write_replacement_header(self.parent_device) else: mlogger.info('More than 1 LV left in VG, will proceed to ' 'destroy LV only') mlogger.info('Removing LV because --destroy was given: %s', device.path) + if self.args.replace_block: + mlogger.info(f'--replace-block passed but the device still has {str(len(lvs))} LV(s)') api.remove_lv(device.path) elif lv: # just remove all lvm metadata, leaving the LV around lv.clear_tags() - def zap_partition(self, device): + def zap_partition(self, device: Device) -> None: """ Device example: /dev/sda1 Requirements: Must be a partition @@ -247,7 +346,7 @@ class Zap(object): mlogger.info("Destroying partition since --destroy was used: %s" % device.path) disk.remove_partition(device) - def zap_lvm_member(self, device): + def zap_lvm_member(self, device: Device) -> None: """ An LVM member may have more than one LV and or VG, for example if it is a raw device with multiple partitions each belonging to a different LV @@ -267,7 +366,7 @@ class Zap(object): - def zap_raw_device(self, device): + def zap_raw_device(self, device: Device) -> None: """ Any whole (raw) device passed in as input will be processed here, checking for LVM membership and partitions (if any). 
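Earlier in this hunk, `zap_lv` only marks the parent device when the LV's `ceph.type` tag matches whichever `--replace-*` flag was passed. That dispatch reduces to a dictionary lookup; a small self-contained illustration of the decision (plain dicts and booleans, no LVM involved):

    from typing import Dict


    def should_mark_parent(lv_tags: Dict[str, str],
                           replace_block: bool = False,
                           replace_db: bool = False,
                           replace_wal: bool = False) -> bool:
        """Return True when the zapped LV's type matches a requested replacement."""
        replacement_args: Dict[str, bool] = {
            'block': replace_block,
            'db': replace_db,
            'wal': replace_wal,
        }
        # An unknown or missing ceph.type simply means "do not mark anything".
        return replacement_args.get(lv_tags.get('ceph.type', ''), False)


    assert should_mark_parent({'ceph.type': 'db'}, replace_db=True)
    assert not should_mark_parent({'ceph.type': 'block'}, replace_db=True)
    assert not should_mark_parent({}, replace_block=True)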
@@ -287,11 +386,19 @@ class Zap(object): self.zap_partition(Device('/dev/%s' % part_name)) zap_device(device.path) + # TODO(guits): I leave this commented out, this should be part of a separate patch in order to + # support device replacement with raw-based OSDs + # if self.args.replace_block: + # disk._dd_write(device.path, 'CEPH_DEVICE_BEING_REPLACED') @decorators.needs_root - def zap(self, devices=None): - devices = devices or self.args.devices + def zap(self) -> None: + """Zap a device. + Raises: + SystemExit: When the device is a mapper and not a mpath device. + """ + devices = self.args.devices for device in devices: mlogger.info("Zapping: %s", device.path) if device.is_mapper and not device.is_mpath: @@ -317,21 +424,21 @@ class Zap(object): ) @decorators.needs_root - def zap_osd(self): + def zap_osd(self) -> None: if self.args.osd_id and not self.args.no_systemd: osd_is_running = systemctl.osd_is_active(self.args.osd_id) if osd_is_running: mlogger.error("OSD ID %s is running, stop it with:" % self.args.osd_id) mlogger.error("systemctl stop ceph-osd@%s" % self.args.osd_id) raise SystemExit("Unable to zap devices associated with OSD ID: %s" % self.args.osd_id) - devices = find_associated_devices(self.args.osd_id, self.args.osd_fsid) - self.zap(devices) + self.args.devices = self.find_associated_devices() + self.zap() - def dmcrypt_close(self, dmcrypt_uuid): + def dmcrypt_close(self, dmcrypt_uuid: str) -> None: mlogger.info("Closing encrypted volume %s", dmcrypt_uuid) encryption.dmcrypt_close(mapping=dmcrypt_uuid, skip_path_check=True) - def main(self): + def main(self) -> None: sub_command_help = dedent(""" Zaps the given logical volume(s), raw device(s) or partition(s) for reuse by ceph-volume. If given a path to a logical volume it must be in the format of vg/lv. Any @@ -419,12 +526,56 @@ class Zap(object): help='Skip systemd unit checks', ) + parser.add_argument( + '--replace-block', + dest='replace_block', + action='store_true', + help='Mark the block device as unavailable.' + ) + + parser.add_argument( + '--replace-db', + dest='replace_db', + action='store_true', + help='Mark the db device as unavailable.' + ) + + parser.add_argument( + '--replace-wal', + dest='replace_wal', + action='store_true', + help='Mark the wal device as unavailable.' + ) + + parser.add_argument( + '--clear-replace-header', + dest='clear_replace_header', + type=arg_validators.ValidClearReplaceHeaderDevice(), + help='clear the replace header on devices.' 
+ ) + if len(self.argv) == 0: print(sub_command_help) return self.args = parser.parse_args(self.argv) + if self.args.clear_replace_header: + rc: bool = False + try: + rc = self.clear_replace_header() + except Exception as e: + raise SystemExit(e) + if rc: + mlogger.info(f'Replacement header cleared on {self.args.clear_replace_header}') + else: + mlogger.info(f'No replacement header detected on {self.args.clear_replace_header}, nothing to do.') + raise SystemExit(not rc) + + if self.args.replace_block or self.args.replace_db or self.args.replace_wal: + self.args.destroy = True + mlogger.info('--replace-block|db|wal passed, enforcing --destroy.') + if self.args.osd_id or self.args.osd_fsid: self.zap_osd() else: diff --git a/src/ceph-volume/ceph_volume/devices/raw/list.py b/src/ceph-volume/ceph_volume/devices/raw/list.py index f6ac08eab98..68923216a41 100644 --- a/src/ceph-volume/ceph_volume/devices/raw/list.py +++ b/src/ceph-volume/ceph_volume/devices/raw/list.py @@ -5,12 +5,14 @@ import logging from textwrap import dedent from ceph_volume import decorators, process from ceph_volume.util import disk -from typing import Any, Dict, List as _List +from ceph_volume.util.device import Device +from typing import Any, Dict, Optional, List as _List +from concurrent.futures import ThreadPoolExecutor logger = logging.getLogger(__name__) -def direct_report(devices): +def direct_report(devices: Optional[_List[str]] = None) -> Dict[str, Any]: """ Other non-cli consumers of listing information will want to consume the report without the need to parse arguments or other flags. This helper @@ -20,27 +22,29 @@ def direct_report(devices): _list = List([]) return _list.generate(devices) -def _get_bluestore_info(dev: str) -> Dict[str, Any]: +def _get_bluestore_info(devices: _List[str]) -> Dict[str, Any]: result: Dict[str, Any] = {} - out, err, rc = process.call([ - 'ceph-bluestore-tool', 'show-label', - '--dev', dev], verbose_on_failure=False) + command: _List[str] = ['ceph-bluestore-tool', + 'show-label', '--bdev_aio_poll_ms=1'] + for device in devices: + command.extend(['--dev', device]) + out, err, rc = process.call(command, verbose_on_failure=False) if rc: - # ceph-bluestore-tool returns an error (below) if device is not bluestore OSD - # > unable to read label for <device>: (2) No such file or directory - # but it's possible the error could be for a different reason (like if the disk fails) - logger.debug(f'assuming device {dev} is not BlueStore; ceph-bluestore-tool failed to get info from device: {out}\n{err}') + logger.debug(f"ceph-bluestore-tool couldn't detect any BlueStore device.\n{out}\n{err}") else: oj = json.loads(''.join(out)) - if dev not in oj: - # should be impossible, so warn - logger.warning(f'skipping device {dev} because it is not reported in ceph-bluestore-tool output: {out}') - try: - result = disk.bluestore_info(dev, oj) - except KeyError as e: - # this will appear for devices that have a bluestore header but aren't valid OSDs - # for example, due to incomplete rollback of OSDs: https://tracker.ceph.com/issues/51869 - logger.error(f'device {dev} does not have all BlueStore data needed to be a valid OSD: {out}\n{e}') + for device in devices: + if device not in oj: + # should be impossible, so warn + logger.warning(f'skipping device {device} because it is not reported in ceph-bluestore-tool output: {out}') + if oj.get(device): + try: + osd_uuid = oj[device]['osd_uuid'] + result[osd_uuid] = disk.bluestore_info(device, oj) + except KeyError as e: + # this will appear for devices that have a 
bluestore header but aren't valid OSDs + # for example, due to incomplete rollback of OSDs: https://tracker.ceph.com/issues/51869 + logger.error(f'device {device} does not have all BlueStore data needed to be a valid OSD: {out}\n{e}') return result @@ -50,68 +54,67 @@ class List(object): def __init__(self, argv: _List[str]) -> None: self.argv = argv - - def is_atari_partitions(self, _lsblk: Dict[str, Any]) -> bool: - dev = _lsblk['NAME'] - if _lsblk.get('PKNAME'): - parent = _lsblk['PKNAME'] - try: - if disk.has_bluestore_label(parent): - logger.warning(('ignoring child device {} whose parent {} is a BlueStore OSD.'.format(dev, parent), - 'device is likely a phantom Atari partition. device info: {}'.format(_lsblk))) - return True - except OSError as e: - logger.error(('ignoring child device {} to avoid reporting invalid BlueStore data from phantom Atari partitions.'.format(dev), - 'failed to determine if parent device {} is BlueStore. err: {}'.format(parent, e))) - return True - return False - - def exclude_atari_partitions(self, _lsblk_all: Dict[str, Any]) -> _List[Dict[str, Any]]: - return [_lsblk for _lsblk in _lsblk_all if not self.is_atari_partitions(_lsblk)] - - def generate(self, devs=None): + self.info_devices: _List[Dict[str, str]] = [] + self.devices_to_scan: _List[str] = [] + + def exclude_atari_partitions(self) -> None: + result: _List[str] = [] + for info_device in self.info_devices: + path = info_device['NAME'] + parent_device = info_device.get('PKNAME') + if parent_device: + try: + if disk.has_bluestore_label(parent_device): + logger.warning(('ignoring child device {} whose parent {} is a BlueStore OSD.'.format(path, parent_device), + 'device is likely a phantom Atari partition. device info: {}'.format(info_device))) + continue + except OSError as e: + logger.error(('ignoring child device {} to avoid reporting invalid BlueStore data from phantom Atari partitions.'.format(path), + 'failed to determine if parent device {} is BlueStore. err: {}'.format(parent_device, e))) + continue + result.append(path) + self.devices_to_scan = result + + def exclude_lvm_osd_devices(self) -> None: + with ThreadPoolExecutor() as pool: + filtered_devices_to_scan = pool.map(self.filter_lvm_osd_devices, self.devices_to_scan) + self.devices_to_scan = [device for device in filtered_devices_to_scan if device is not None] + + def filter_lvm_osd_devices(self, device: str) -> Optional[str]: + d = Device(device) + return d.path if not d.ceph_device_lvm else None + + def generate(self, devices: Optional[_List[str]] = None) -> Dict[str, Any]: logger.debug('Listing block devices via lsblk...') - info_devices = [] - if not devs or not any(devs): + if not devices or not any(devices): # If no devs are given initially, we want to list ALL devices including children and # parents. Parent disks with child partitions may be the appropriate device to return if # the parent disk has a bluestore header, but children may be the most appropriate # devices to return if the parent disk does not have a bluestore header. - info_devices = disk.lsblk_all(abspath=True) - devs = [device['NAME'] for device in info_devices if device.get('NAME',)] + self.info_devices = disk.lsblk_all(abspath=True) + # Linux kernels built with CONFIG_ATARI_PARTITION enabled can falsely interpret + # bluestore's on-disk format as an Atari partition table. These false Atari partitions + # can be interpreted as real OSDs if a bluestore OSD was previously created on the false + # partition. See https://tracker.ceph.com/issues/52060 for more info. 
If a device has a + # parent, it is a child. If the parent is a valid bluestore OSD, the child will only + # exist if it is a phantom Atari partition, and the child should be ignored. If the + # parent isn't bluestore, then the child could be a valid bluestore OSD. If we fail to + # determine whether a parent is bluestore, we should err on the side of not reporting + # the child so as not to give a false negative. + self.exclude_atari_partitions() + self.exclude_lvm_osd_devices() + else: - for dev in devs: - info_devices.append(disk.lsblk(dev, abspath=True)) - - # Linux kernels built with CONFIG_ATARI_PARTITION enabled can falsely interpret - # bluestore's on-disk format as an Atari partition table. These false Atari partitions - # can be interpreted as real OSDs if a bluestore OSD was previously created on the false - # partition. See https://tracker.ceph.com/issues/52060 for more info. If a device has a - # parent, it is a child. If the parent is a valid bluestore OSD, the child will only - # exist if it is a phantom Atari partition, and the child should be ignored. If the - # parent isn't bluestore, then the child could be a valid bluestore OSD. If we fail to - # determine whether a parent is bluestore, we should err on the side of not reporting - # the child so as not to give a false negative. - info_devices = self.exclude_atari_partitions(info_devices) - - result = {} - logger.debug('inspecting devices: {}'.format(devs)) - for info_device in info_devices: - bs_info = _get_bluestore_info(info_device['NAME']) - if not bs_info: - # None is also returned in the rare event that there is an issue reading info from - # a BlueStore disk, so be sure to log our assumption that it isn't bluestore - logger.info('device {} does not have BlueStore information'.format(info_device['NAME'])) - continue - uuid = bs_info['osd_uuid'] - if uuid not in result: - result[uuid] = {} - result[uuid].update(bs_info) + self.devices_to_scan = devices + + result: Dict[str, Any] = {} + logger.debug('inspecting devices: {}'.format(self.devices_to_scan)) + result = _get_bluestore_info(self.devices_to_scan) return result @decorators.needs_root - def list(self, args): + def list(self, args: argparse.Namespace) -> None: report = self.generate(args.device) if args.format == 'json': print(json.dumps(report, indent=4, sort_keys=True)) @@ -120,7 +123,7 @@ class List(object): raise SystemExit('No valid Ceph devices found') raise RuntimeError('not implemented yet') - def main(self): + def main(self) -> None: sub_command_help = dedent(""" List OSDs on raw devices with raw device labels (usually the first block of the device). 
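The headline change in the raw/list.py hunk above is that `_get_bluestore_info` now issues a single `ceph-bluestore-tool show-label` call with one `--dev` argument per device and re-keys the parsed JSON by `osd_uuid`, instead of shelling out once per device. A sketch of the command construction and re-keying, with the subprocess call replaced by a canned JSON string so it runs standalone (the real helper passes each label through `disk.bluestore_info`; the dict merge below is only a stand-in):

    import json
    from typing import Any, Dict, List


    def build_show_label_command(devices: List[str]) -> List[str]:
        command = ['ceph-bluestore-tool', 'show-label', '--bdev_aio_poll_ms=1']
        for device in devices:
            command.extend(['--dev', device])
        return command


    def rekey_by_osd_uuid(show_label_json: str, devices: List[str]) -> Dict[str, Any]:
        labels = json.loads(show_label_json)
        result: Dict[str, Any] = {}
        for device in devices:
            label = labels.get(device)
            if label and 'osd_uuid' in label:
                # One entry per OSD uuid, regardless of how many devices were scanned.
                result[label['osd_uuid']] = {'device': device, **label}
        return result


    canned = '{"/dev/sdb": {"osd_uuid": "sdb-uuid", "type": "bluestore"}}'
    print(build_show_label_command(['/dev/sdb', '/dev/sdc']))
    print(rekey_by_osd_uuid(canned, ['/dev/sdb', '/dev/sdc']))

The other notable pattern in the same hunk, `exclude_lvm_osd_devices`, is a plain `ThreadPoolExecutor.map` over the candidate paths followed by dropping the `None` results, so LVM-managed devices are filtered out in parallel.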
diff --git a/src/ceph-volume/ceph_volume/tests/api/test_lvm.py b/src/ceph-volume/ceph_volume/tests/api/test_lvm.py index 9ad2f701f12..6a5eee0e1b8 100644 --- a/src/ceph-volume/ceph_volume/tests/api/test_lvm.py +++ b/src/ceph-volume/ceph_volume/tests/api/test_lvm.py @@ -883,15 +883,3 @@ class TestGetSingleLV(object): assert isinstance(lv_, api.Volume) assert lv_.name == 'lv1' - - -class TestHelpers: - def test_get_lv_path_from_mapper(self): - mapper = '/dev/mapper/ceph--c1a97e46--234c--46aa--a549--3ca1d1f356a9-osd--block--32e8e896--172e--4a38--a06a--3702598510ec' - lv_path = api.get_lv_path_from_mapper(mapper) - assert lv_path == '/dev/ceph-c1a97e46-234c-46aa-a549-3ca1d1f356a9/osd-block-32e8e896-172e-4a38-a06a-3702598510ec' - - def test_get_mapper_from_lv_path(self): - lv_path = '/dev/ceph-c1a97e46-234c-46aa-a549-3ca1d1f356a9/osd-block-32e8e896-172e-4a38-a06a-3702598510ec' - mapper = api.get_mapper_from_lv_path(lv_path) - assert mapper == '/dev/mapper/ceph--c1a97e46--234c--46aa--a549--3ca1d1f356a9/osd--block--32e8e896--172e--4a38--a06a/3702598510ec' diff --git a/src/ceph-volume/ceph_volume/tests/conftest.py b/src/ceph-volume/ceph_volume/tests/conftest.py index ee58081d97d..e6bf31737b6 100644 --- a/src/ceph-volume/ceph_volume/tests/conftest.py +++ b/src/ceph-volume/ceph_volume/tests/conftest.py @@ -360,7 +360,7 @@ def device_info(monkeypatch, patch_bluestore_label): has_bluestore_label=False): if devices: for dev in devices.keys(): - devices[dev]['device_nodes'] = os.path.basename(dev) + devices[dev]['device_nodes'] = [os.path.basename(dev)] else: devices = {} lsblk = lsblk if lsblk else {} diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py new file mode 100644 index 00000000000..cca64e83ab0 --- /dev/null +++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py @@ -0,0 +1,81 @@ +ceph_bluestore_tool_output = ''' +{ + "/dev/sdb": { + "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6", + "size": 1099511627776, + "btime": "2021-07-23T16:02:22.809186+0000", + "description": "main", + "bfm_blocks": "268435456", + "bfm_blocks_per_key": "128", + "bfm_bytes_per_block": "4096", + "bfm_size": "1099511627776", + "bluefs": "1", + "ceph_fsid": "sdb-fsid", + "ceph_version_when_created": "ceph version 19.3.0-5537-gb9ba4e48 (b9ba4e48633d6d90d5927a4e66b9ecbb4d7e6e73) squid (dev)", + "kv_backend": "rocksdb", + "magic": "ceph osd volume v026", + "mkfs_done": "yes", + "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==", + "ready": "ready", + "require_osd_release": "16", + "type": "bluestore", + "whoami": "0" + }, + "/dev/vdx": { + "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6", + "size": 214748364800, + "btime": "2024-10-16T10:51:05.955279+0000", + "description": "main", + "bfm_blocks": "52428800", + "bfm_blocks_per_key": "128", + "bfm_bytes_per_block": "4096", + "bfm_size": "214748364800", + "bluefs": "1", + "ceph_fsid": "2d20bc8c-8a0c-11ef-aaba-525400e54507", + "ceph_version_when_created": "ceph version 19.3.0-5537-gb9ba4e48 (b9ba4e48633d6d90d5927a4e66b9ecbb4d7e6e73) squid (dev)", + "created_at": "2024-10-16T10:51:09.121455Z", + "elastic_shared_blobs": "1", + "epoch": "16", + "kv_backend": "rocksdb", + "magic": "ceph osd volume v026", + "multi": "yes", + "osd_key": "AQCZmg9nxOKTCBAA6EQftuqMuKMHqypSAfqBsQ==", + "ready": "ready", + "type": "bluestore", + "whoami": "5" + }, + "/dev/vdy": { + "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6", + "size": 214748364800, + "btime": "2024-10-16T10:51:05.961279+0000", + 
"description": "bluefs db" + }, + "/dev/vdz": { + "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6", + "size": 214748364800, + "btime": "2024-10-16T10:51:05.961279+0000", + "description": "bluefs wal" + } +} +'''.split('\n') + +lsblk_all = ['NAME="/dev/sdb" KNAME="/dev/sdb" PKNAME="" PARTLABEL=""', + 'NAME="/dev/sdx" KNAME="/dev/sdx" PKNAME="" PARTLABEL=""', + 'NAME="/dev/sdy" KNAME="/dev/sdy" PKNAME="" PARTLABEL=""', + 'NAME="/dev/sdz" KNAME="/dev/sdz" PKNAME="" PARTLABEL=""'] + +blkid_output = ['/dev/ceph-1172bba3-3e0e-45e5-ace6-31ae8401221f/osd-block-5050a85c-d1a7-4d66-b4ba-2e9b1a2970ae: TYPE="ceph_bluestore" USAGE="other"'] + +udevadm_property = '''DEVNAME=/dev/sdb +DEVTYPE=disk +ID_ATA=1 +ID_BUS=ata +ID_MODEL=SK_hynix_SC311_SATA_512GB +ID_PART_TABLE_TYPE=gpt +ID_PART_TABLE_UUID=c8f91d57-b26c-4de1-8884-0c9541da288c +ID_PATH=pci-0000:00:17.0-ata-3 +ID_PATH_TAG=pci-0000_00_17_0-ata-3 +ID_REVISION=70000P10 +ID_SERIAL=SK_hynix_SC311_SATA_512GB_MS83N71801150416A +TAGS=:systemd: +USEC_INITIALIZED=16117769'''.split('\n')
\ No newline at end of file diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py index d630a7a6bf8..d9b3bdfd239 100644 --- a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py +++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_zap.py @@ -1,3 +1,4 @@ +# type: ignore import os import pytest from copy import deepcopy @@ -5,16 +6,54 @@ from mock.mock import patch, call, Mock from ceph_volume import process from ceph_volume.api import lvm as api from ceph_volume.devices.lvm import zap - - -class TestZap(object): - def test_invalid_osd_id_passed(self): +from . import data_zap +from typing import Tuple, List + + +def process_call(command, **kw): + result: Tuple[List[str], List[str], int] = '' + if 'udevadm' in command: + result = data_zap.udevadm_property, [], 0 + if 'ceph-bluestore-tool' in command: + result = data_zap.ceph_bluestore_tool_output, [], 0 + if 'is-active' in command: + result = [], [], 1 + if 'lsblk' in command: + result = data_zap.lsblk_all, [], 0 + if 'blkid' in command: + result = data_zap.blkid_output, [], 0 + if 'pvs' in command: + result = [], [], 0 + return result + + +class TestZap: + def test_invalid_osd_id_passed(self) -> None: with pytest.raises(SystemExit): zap.Zap(argv=['--osd-id', 'foo']).main() -class TestFindAssociatedDevices(object): - - def test_no_lvs_found_that_match_id(self, monkeypatch, device_info): + @patch('ceph_volume.util.disk._dd_write', Mock()) + @patch('ceph_volume.util.arg_validators.Device') + def test_clear_replace_header_is_being_replaced(self, m_device: Mock) -> None: + m_dev = m_device.return_value + m_dev.is_being_replaced = True + with pytest.raises(SystemExit) as e: + zap.Zap(argv=['--clear', '/dev/foo']).main() + assert e.value.code == 0 + + @patch('ceph_volume.util.disk._dd_write', Mock()) + @patch('ceph_volume.util.arg_validators.Device') + def test_clear_replace_header_is_not_being_replaced(self, m_device: Mock) -> None: + m_dev = m_device.return_value + m_dev.is_being_replaced = False + with pytest.raises(SystemExit) as e: + zap.Zap(argv=['--clear', '/dev/foo']).main() + assert e.value.code == 1 + + @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={})) + @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb')) + @patch('ceph_volume.process.call', Mock(side_effect=process_call)) + def test_no_lvs_and_raw_found_that_match_id(self, is_root, monkeypatch, device_info): tags = 'ceph.osd_id=9,ceph.journal_uuid=x,ceph.type=data' osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='vg', lv_tags=tags, lv_path='/dev/VolGroup/lv') @@ -22,10 +61,15 @@ class TestFindAssociatedDevices(object): volumes.append(osd) monkeypatch.setattr(zap.api, 'get_lvs', lambda **kwargs: {}) - with pytest.raises(RuntimeError): - zap.find_associated_devices(osd_id=10) + z = zap.Zap(['--osd-id', '10']) - def test_no_lvs_found_that_match_fsid(self, monkeypatch, device_info): + with pytest.raises(SystemExit): + z.main() + + @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={})) + @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb')) + @patch('ceph_volume.process.call', Mock(side_effect=process_call)) + def test_no_lvs_and_raw_found_that_match_fsid(self, is_root, monkeypatch): tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,'+\ 'ceph.type=data' osd = api.Volume(lv_name='volume1', lv_uuid='y', lv_tags=tags, @@ -34,10 +78,15 @@ class 
TestFindAssociatedDevices(object): volumes.append(osd) monkeypatch.setattr(zap.api, 'get_lvs', lambda **kwargs: {}) - with pytest.raises(RuntimeError): - zap.find_associated_devices(osd_fsid='aaaa-lkjh') + z = zap.Zap(['--osd-fsid', 'aaaa-lkjh']) - def test_no_lvs_found_that_match_id_fsid(self, monkeypatch, device_info): + with pytest.raises(SystemExit): + z.main() + + @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={})) + @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb')) + @patch('ceph_volume.process.call', Mock(side_effect=process_call)) + def test_no_lvs_and_raw_found_that_match_id_fsid(self, is_root, monkeypatch): tags = 'ceph.osd_id=9,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,'+\ 'ceph.type=data' osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='vg', @@ -46,45 +95,82 @@ class TestFindAssociatedDevices(object): volumes.append(osd) monkeypatch.setattr(zap.api, 'get_lvs', lambda **kwargs: {}) - with pytest.raises(RuntimeError): - zap.find_associated_devices(osd_id='9', osd_fsid='aaaa-lkjh') + z = zap.Zap(['--osd-id', '9', '--osd-fsid', 'aaaa-lkjh']) - def test_no_ceph_lvs_found(self, monkeypatch): + with pytest.raises(SystemExit): + z.main() + + @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={})) + def test_no_ceph_lvs_and_no_ceph_raw_found(self, is_root, monkeypatch): osd = api.Volume(lv_name='volume1', lv_uuid='y', lv_tags='', lv_path='/dev/VolGroup/lv') volumes = [] volumes.append(osd) monkeypatch.setattr(zap.api, 'get_lvs', lambda **kwargs: {}) - with pytest.raises(RuntimeError): - zap.find_associated_devices(osd_id=100) + z = zap.Zap(['--osd-id', '100']) - def test_lv_is_matched_id(self, monkeypatch): + with pytest.raises(SystemExit): + z.main() + + @patch('ceph_volume.devices.lvm.zap.Zap.zap') + @patch('ceph_volume.process.call', Mock(side_effect=process_call)) + def test_lv_is_matched_id(self, mock_zap, monkeypatch, is_root): tags = 'ceph.osd_id=0,ceph.journal_uuid=x,ceph.type=data' osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags) + volumes = [osd] + monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes) + + z = zap.Zap(['--osd-id', '0']) + z.main() + assert z.args.devices[0].path == '/dev/VolGroup/lv' + mock_zap.assert_called_once() + + # @patch('ceph_volume.devices.lvm.zap.disk.has_bluestore_label', Mock(return_value=True)) + @patch('ceph_volume.devices.lvm.zap.Zap.zap') + @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb')) + @patch('ceph_volume.process.call', Mock(side_effect=process_call)) + def test_raw_is_matched_id(self, mock_zap, monkeypatch, is_root): volumes = [] - volumes.append(osd) monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes) - monkeypatch.setattr(process, 'call', lambda x, **kw: ('', '', 0)) - result = zap.find_associated_devices(osd_id='0') - assert result[0].path == '/dev/VolGroup/lv' + z = zap.Zap(['--osd-id', '0']) + z.main() + assert z.args.devices[0].path == '/dev/sdb' + mock_zap.assert_called_once() - def test_lv_is_matched_fsid(self, monkeypatch): + @patch('ceph_volume.devices.lvm.zap.Zap.zap') + def test_lv_is_matched_fsid(self, mock_zap, monkeypatch, is_root): tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,' +\ 'ceph.type=data' osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags) - volumes = [] - volumes.append(osd) + volumes = [osd] monkeypatch.setattr(zap.api, 
'get_lvs', lambda **kw: deepcopy(volumes)) monkeypatch.setattr(process, 'call', lambda x, **kw: ('', '', 0)) - result = zap.find_associated_devices(osd_fsid='asdf-lkjh') - assert result[0].path == '/dev/VolGroup/lv' + z = zap.Zap(['--osd-fsid', 'asdf-lkjh']) + z.main() + + assert z.args.devices[0].path == '/dev/VolGroup/lv' + mock_zap.assert_called_once + + @patch('ceph_volume.devices.lvm.zap.Zap.zap') + @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb')) + @patch('ceph_volume.process.call', Mock(side_effect=process_call)) + def test_raw_is_matched_fsid(self, mock_zap, monkeypatch, is_root): + volumes = [] + monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes) + + z = zap.Zap(['--osd-fsid', 'd5a496bc-dcb9-4ad0-a12c-393d3200d2b6']) + z.main() + + assert z.args.devices[0].path == '/dev/sdb' + mock_zap.assert_called_once - def test_lv_is_matched_id_fsid(self, monkeypatch): + @patch('ceph_volume.devices.lvm.zap.Zap.zap') + def test_lv_is_matched_id_fsid(self, mock_zap, monkeypatch, is_root): tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,' +\ 'ceph.type=data' osd = api.Volume(lv_name='volume1', lv_uuid='y', vg_name='', @@ -94,26 +180,43 @@ class TestFindAssociatedDevices(object): monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes) monkeypatch.setattr(process, 'call', lambda x, **kw: ('', '', 0)) - result = zap.find_associated_devices(osd_id='0', osd_fsid='asdf-lkjh') - assert result[0].path == '/dev/VolGroup/lv' + z = zap.Zap(['--osd-id', '0', '--osd-fsid', 'asdf-lkjh', '--no-systemd']) + z.main() + assert z.args.devices[0].path == '/dev/VolGroup/lv' + mock_zap.assert_called_once -class TestEnsureAssociatedLVs(object): - - @patch('ceph_volume.devices.lvm.zap.api', Mock(return_value=[])) - def test_nothing_is_found(self): + @patch('ceph_volume.devices.lvm.zap.Zap.zap') + @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(return_value='/dev/sdb')) + @patch('ceph_volume.process.call', Mock(side_effect=process_call)) + def test_raw_is_matched_id_fsid(self, mock_zap, monkeypatch, is_root): volumes = [] - result = zap.ensure_associated_lvs(volumes) - assert result == [] + monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes) - def test_data_is_found(self, fake_call): - tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=data' - osd = api.Volume( - lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/data', lv_tags=tags) + z = zap.Zap(['--osd-id', '0', '--osd-fsid', 'd5a496bc-dcb9-4ad0-a12c-393d3200d2b6']) + z.main() + + assert z.args.devices[0].path == '/dev/sdb' + mock_zap.assert_called_once + + @patch('ceph_volume.devices.lvm.zap.Zap.zap') + @patch('ceph_volume.devices.raw.list.List.filter_lvm_osd_devices', Mock(side_effect=['/dev/vdx', '/dev/vdy', '/dev/vdz', None])) + @patch('ceph_volume.process.call', Mock(side_effect=process_call)) + def test_raw_multiple_devices(self, mock_zap, monkeypatch, is_root): volumes = [] - volumes.append(osd) - result = zap.ensure_associated_lvs(volumes) - assert result == ['/dev/VolGroup/data'] + monkeypatch.setattr(zap.api, 'get_lvs', lambda **kw: volumes) + z = zap.Zap(['--osd-id', '5']) + z.main() + + set([device.path for device in z.args.devices]) == {'/dev/vdx', '/dev/vdy', '/dev/vdz'} + mock_zap.assert_called_once + + @patch('ceph_volume.devices.lvm.zap.direct_report', Mock(return_value={})) + @patch('ceph_volume.devices.lvm.zap.api.get_lvs', Mock(return_value=[])) + def test_nothing_is_found(self, is_root): + z 
= zap.Zap(['--osd-id', '0']) + with pytest.raises(SystemExit): + z.main() def test_block_is_found(self, fake_call): tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=block' @@ -121,7 +224,7 @@ class TestEnsureAssociatedLVs(object): lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/block', lv_tags=tags) volumes = [] volumes.append(osd) - result = zap.ensure_associated_lvs(volumes) + result = zap.Zap([]).ensure_associated_lvs(volumes) assert result == ['/dev/VolGroup/block'] def test_success_message_for_fsid(self, factory, is_root, capsys): @@ -140,28 +243,6 @@ class TestEnsureAssociatedLVs(object): out, err = capsys.readouterr() assert "Zapping successful for OSD: 1" in err - def test_journal_is_found(self, fake_call): - tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal' - osd = api.Volume( - lv_name='volume1', lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv', lv_tags=tags) - volumes = [] - volumes.append(osd) - result = zap.ensure_associated_lvs(volumes) - assert result == ['/dev/VolGroup/lv'] - - @patch('ceph_volume.api.lvm.process.call', Mock(return_value=('', '', 0))) - def test_multiple_journals_are_found(self): - tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=journal' - volumes = [] - for i in range(3): - osd = api.Volume( - lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags) - volumes.append(osd) - result = zap.ensure_associated_lvs(volumes) - assert '/dev/VolGroup/lv0' in result - assert '/dev/VolGroup/lv1' in result - assert '/dev/VolGroup/lv2' in result - @patch('ceph_volume.api.lvm.process.call', Mock(return_value=('', '', 0))) def test_multiple_dbs_are_found(self): tags = 'ceph.osd_id=0,ceph.osd_fsid=asdf-lkjh,ceph.journal_uuid=x,ceph.type=db' @@ -170,7 +251,7 @@ class TestEnsureAssociatedLVs(object): osd = api.Volume( lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags) volumes.append(osd) - result = zap.ensure_associated_lvs(volumes) + result = zap.Zap([]).ensure_associated_lvs(volumes) assert '/dev/VolGroup/lv0' in result assert '/dev/VolGroup/lv1' in result assert '/dev/VolGroup/lv2' in result @@ -183,7 +264,7 @@ class TestEnsureAssociatedLVs(object): osd = api.Volume( lv_name='volume%s' % i, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % i, lv_tags=tags) volumes.append(osd) - result = zap.ensure_associated_lvs(volumes) + result = zap.Zap([]).ensure_associated_lvs(volumes) assert '/dev/VolGroup/lv0' in result assert '/dev/VolGroup/lv1' in result assert '/dev/VolGroup/lv2' in result @@ -196,14 +277,14 @@ class TestEnsureAssociatedLVs(object): osd = api.Volume( lv_name='volume%s' % _type, lv_uuid='y', vg_name='', lv_path='/dev/VolGroup/lv%s' % _type, lv_tags=tags) volumes.append(osd) - result = zap.ensure_associated_lvs(volumes) + result = zap.Zap([]).ensure_associated_lvs(volumes) assert '/dev/VolGroup/lvjournal' in result assert '/dev/VolGroup/lvwal' in result assert '/dev/VolGroup/lvdb' in result @patch('ceph_volume.devices.lvm.zap.api.get_lvs') def test_ensure_associated_lvs(self, m_get_lvs): - zap.ensure_associated_lvs([], lv_tags={'ceph.osd_id': '1'}) + zap.Zap([]).ensure_associated_lvs([], lv_tags={'ceph.osd_id': '1'}) calls = [ call(tags={'ceph.type': 'db', 'ceph.osd_id': '1'}), call(tags={'ceph.type': 'wal', 'ceph.osd_id': '1'}) diff --git a/src/ceph-volume/ceph_volume/tests/devices/raw/data_list.py b/src/ceph-volume/ceph_volume/tests/devices/raw/data_list.py new 
file mode 100644 index 00000000000..e1d1a48967a --- /dev/null +++ b/src/ceph-volume/ceph_volume/tests/devices/raw/data_list.py @@ -0,0 +1,102 @@ +ceph_bluestore_tool_show_label_output: str = '''{ + "/dev/sdb": { + "osd_uuid": "sdb-uuid", + "size": 1099511627776, + "btime": "2021-07-23T16:02:22.809186+0000", + "description": "main", + "bfm_blocks": "268435456", + "bfm_blocks_per_key": "128", + "bfm_bytes_per_block": "4096", + "bfm_size": "1099511627776", + "bluefs": "1", + "ceph_fsid": "sdb-fsid", + "kv_backend": "rocksdb", + "magic": "ceph osd volume v026", + "mkfs_done": "yes", + "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==", + "ready": "ready", + "require_osd_release": "16", + "type": "bluestore", + "whoami": "0" + }, + "/dev/sdb2": { + "osd_uuid": "sdb2-uuid", + "size": 1099511627776, + "btime": "2021-07-23T16:02:22.809186+0000", + "description": "main", + "bfm_blocks": "268435456", + "bfm_blocks_per_key": "128", + "bfm_bytes_per_block": "4096", + "bfm_size": "1099511627776", + "bluefs": "1", + "ceph_fsid": "sdb2-fsid", + "kv_backend": "rocksdb", + "magic": "ceph osd volume v026", + "mkfs_done": "yes", + "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==", + "ready": "ready", + "require_osd_release": "16", + "type": "bluestore", + "whoami": "2" + }, + "/dev/sde1": { + "osd_uuid": "sde1-uuid", + "size": 214747316224, + "btime": "2023-07-26T13:20:19.509457+0000", + "description": "main", + "bfm_blocks": "268435456", + "bfm_blocks_per_key": "128", + "bfm_bytes_per_block": "4096", + "bfm_size": "214747316224", + "bluefs": "1", + "ceph_fsid": "sde1-fsid", + "kv_backend": "rocksdb", + "magic": "ceph osd volume v026", + "mkfs_done": "yes", + "osd_key": "AQCSHcFkUeLIMBAAjKqANkXafjvVISkXt6FGCA==", + "ready": "ready", + "require_osd_release": "16", + "type": "bluestore", + "whoami": "1" + }, + "/dev/mapper/ceph--osd--block--1": { + "osd_uuid": "lvm-1-uuid", + "size": 549751619584, + "btime": "2021-07-23T16:04:37.881060+0000", + "description": "main", + "bfm_blocks": "134216704", + "bfm_blocks_per_key": "128", + "bfm_bytes_per_block": "4096", + "bfm_size": "549751619584", + "bluefs": "1", + "ceph_fsid": "lvm-1-fsid", + "kv_backend": "rocksdb", + "magic": "ceph osd volume v026", + "mkfs_done": "yes", + "osd_key": "AQCU6Ppgz+UcIRAAh6IUjtPjiXBlEXfwO8ixzw==", + "ready": "ready", + "require_osd_release": "16", + "type": "bluestore", + "whoami": "2" + }, + "/dev/mapper/ceph--osd--block--1": { + "osd_uuid": "lvm-1-uuid", + "size": 549751619584, + "btime": "2021-07-23T16:04:37.881060+0000", + "description": "main", + "bfm_blocks": "134216704", + "bfm_blocks_per_key": "128", + "bfm_bytes_per_block": "4096", + "bfm_size": "549751619584", + "bluefs": "1", + "ceph_fsid": "lvm-1-fsid", + "kv_backend": "rocksdb", + "magic": "ceph osd volume v026", + "mkfs_done": "yes", + "osd_key": "AQCU6Ppgz+UcIRAAh6IUjtPjiXBlEXfwO8ixzw==", + "ready": "ready", + "require_osd_release": "16", + "type": "bluestore", + "whoami": "2" + } +}'''
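One detail worth noting about the fixture above: the `"/dev/mapper/ceph--osd--block--1"` key appears twice, and `json.loads` keeps only the last occurrence, so the parsed fixture ends up with a single entry for that device. Both occurrences carry identical data here, so nothing is lost, but the parsed dict has one entry fewer than the text suggests. A quick demonstration of the stdlib behaviour:

    import json

    doc = ('{"/dev/mapper/ceph--osd--block--1": {"whoami": "2"}, '
           '"/dev/mapper/ceph--osd--block--1": {"whoami": "3"}}')

    parsed = json.loads(doc)
    print(len(parsed))                                # 1 -- the duplicate key collapses
    print(parsed['/dev/mapper/ceph--osd--block--1'])  # {'whoami': '3'} -- last occurrence wins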
\ No newline at end of file diff --git a/src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py b/src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py index 604fb4faa3e..23d2bfdaa2c 100644 --- a/src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py +++ b/src/ceph-volume/ceph_volume/tests/devices/raw/test_list.py @@ -1,5 +1,7 @@ +# type: ignore import pytest -from mock.mock import patch +from .data_list import ceph_bluestore_tool_show_label_output +from mock.mock import patch, Mock from ceph_volume.devices import raw # Sample lsblk output is below that overviews the test scenario. (--json output for reader clarity) @@ -74,98 +76,6 @@ def _lsblk_output(dev, parent=None): ret = 'NAME="{}" KNAME="{}" PKNAME="{}"'.format(dev, dev, parent) return [ret] # needs to be in a list form -def _bluestore_tool_label_output_sdb(): - return '''{ - "/dev/sdb": { - "osd_uuid": "sdb-uuid", - "size": 1099511627776, - "btime": "2021-07-23T16:02:22.809186+0000", - "description": "main", - "bfm_blocks": "268435456", - "bfm_blocks_per_key": "128", - "bfm_bytes_per_block": "4096", - "bfm_size": "1099511627776", - "bluefs": "1", - "ceph_fsid": "sdb-fsid", - "kv_backend": "rocksdb", - "magic": "ceph osd volume v026", - "mkfs_done": "yes", - "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==", - "ready": "ready", - "require_osd_release": "16", - "whoami": "0" - } -}''' - -def _bluestore_tool_label_output_sdb2(): - return '''{ - "/dev/sdb2": { - "osd_uuid": "sdb2-uuid", - "size": 1099511627776, - "btime": "2021-07-23T16:02:22.809186+0000", - "description": "main", - "bfm_blocks": "268435456", - "bfm_blocks_per_key": "128", - "bfm_bytes_per_block": "4096", - "bfm_size": "1099511627776", - "bluefs": "1", - "ceph_fsid": "sdb2-fsid", - "kv_backend": "rocksdb", - "magic": "ceph osd volume v026", - "mkfs_done": "yes", - "osd_key": "AQAO6PpgK+y4CBAAixq/X7OVimbaezvwD/cDmg==", - "ready": "ready", - "require_osd_release": "16", - "whoami": "2" - } -}''' - -def _bluestore_tool_label_output_sde1(): - return '''{ - "/dev/sde1": { - "osd_uuid": "sde1-uuid", - "size": 214747316224, - "btime": "2023-07-26T13:20:19.509457+0000", - "description": "main", - "bfm_blocks": "268435456", - "bfm_blocks_per_key": "128", - "bfm_bytes_per_block": "4096", - "bfm_size": "214747316224", - "bluefs": "1", - "ceph_fsid": "sde1-fsid", - "kv_backend": "rocksdb", - "magic": "ceph osd volume v026", - "mkfs_done": "yes", - "osd_key": "AQCSHcFkUeLIMBAAjKqANkXafjvVISkXt6FGCA==", - "ready": "ready", - "require_osd_release": "16", - "whoami": "1" - } -}''' - -def _bluestore_tool_label_output_dm_okay(): - return '''{ - "/dev/mapper/ceph--osd--block--1": { - "osd_uuid": "lvm-1-uuid", - "size": 549751619584, - "btime": "2021-07-23T16:04:37.881060+0000", - "description": "main", - "bfm_blocks": "134216704", - "bfm_blocks_per_key": "128", - "bfm_bytes_per_block": "4096", - "bfm_size": "549751619584", - "bluefs": "1", - "ceph_fsid": "lvm-1-fsid", - "kv_backend": "rocksdb", - "magic": "ceph osd volume v026", - "mkfs_done": "yes", - "osd_key": "AQCU6Ppgz+UcIRAAh6IUjtPjiXBlEXfwO8ixzw==", - "ready": "ready", - "require_osd_release": "16", - "whoami": "2" - } -}''' - def _process_call_side_effect(command, **kw): if "lsblk" in command: if "/dev/" in command[-1]: @@ -186,19 +96,7 @@ def _process_call_side_effect(command, **kw): pytest.fail('command {} needs behavior specified for it'.format(command)) if "ceph-bluestore-tool" in command: - if "/dev/sdb" in command: - # sdb is a bluestore OSD - return _bluestore_tool_label_output_sdb(), '', 0 - if 
"/dev/sdb2" in command: - # sdb2 is a phantom atari partition that appears to have some valid bluestore info - return _bluestore_tool_label_output_sdb2(), '', 0 - if "/dev/sde1" in command: - return _bluestore_tool_label_output_sde1(), '', 0 - if "/dev/mapper/ceph--osd--block--1" in command: - # dm device 1 is a valid bluestore OSD (the other is corrupted/invalid) - return _bluestore_tool_label_output_dm_okay(), '', 0 - # sda and children, sdb's children, sdc, sdd, dm device 2 all do NOT have bluestore OSD data - return [], 'fake No such file or directory error', 1 + return ceph_bluestore_tool_show_label_output, '', 0 pytest.fail('command {} needs behavior specified for it'.format(command)) def _has_bluestore_label_side_effect(disk_path): @@ -224,6 +122,7 @@ def _has_bluestore_label_side_effect(disk_path): class TestList(object): + @patch('ceph_volume.devices.raw.list.List.exclude_lvm_osd_devices', Mock()) @patch('ceph_volume.util.device.disk.get_devices') @patch('ceph_volume.util.disk.has_bluestore_label') @patch('ceph_volume.process.call') @@ -257,6 +156,7 @@ class TestList(object): assert sde1['ceph_fsid'] == 'sde1-fsid' assert sde1['type'] == 'bluestore' + @patch('ceph_volume.devices.raw.list.List.exclude_lvm_osd_devices', Mock()) @patch('ceph_volume.util.device.disk.get_devices') @patch('ceph_volume.util.disk.has_bluestore_label') @patch('ceph_volume.process.call') @@ -275,4 +175,4 @@ class TestList(object): result = raw.list.List([]).generate() assert len(result) == 2 - assert 'sdb-uuid' in result + assert {'sdb-uuid', 'sde1-uuid'} == set(result.keys()) diff --git a/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py b/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py index f4f50b06f8a..fd7c468037c 100644 --- a/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py +++ b/src/ceph-volume/ceph_volume/tests/objectstore/test_rawbluestore.py @@ -159,6 +159,7 @@ class TestRawBlueStore: @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.rename_mapper', Mock(return_value=MagicMock())) @patch('ceph_volume.util.disk.get_bluestore_header') + @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.luks_close', Mock(return_value=MagicMock())) @patch('ceph_volume.objectstore.rawbluestore.encryption_utils.luks_open', Mock(return_value=MagicMock())) def test_activate_dmcrypt_tpm(self, m_bs_header, rawbluestore, fake_lsblk_all, mock_raw_direct_report, is_root) -> None: m_bs_header.return_value = { diff --git a/src/ceph-volume/ceph_volume/tests/test_inventory.py b/src/ceph-volume/ceph_volume/tests/test_inventory.py index 785d8b56e86..29cd1fc4e4d 100644 --- a/src/ceph-volume/ceph_volume/tests/test_inventory.py +++ b/src/ceph-volume/ceph_volume/tests/test_inventory.py @@ -118,7 +118,7 @@ def device_data(device_info): class TestInventory(object): expected_keys = [ - 'ceph_device', + 'ceph_device_lvm', 'path', 'rejected_reasons', 'sys_api', @@ -126,6 +126,7 @@ class TestInventory(object): 'lvs', 'device_id', 'lsm_data', + 'being_replaced' ] expected_sys_api_keys = [ diff --git a/src/ceph-volume/ceph_volume/tests/util/test_disk.py b/src/ceph-volume/ceph_volume/tests/util/test_disk.py index 368c2ec8469..8c27ce402fb 100644 --- a/src/ceph-volume/ceph_volume/tests/util/test_disk.py +++ b/src/ceph-volume/ceph_volume/tests/util/test_disk.py @@ -1,4 +1,5 @@ import pytest +import stat from ceph_volume.util import disk from mock.mock import patch, Mock, MagicMock, mock_open from pyfakefs.fake_filesystem_unittest import TestCase @@ -640,3 +641,107 @@ class 
TestBlockSysFs(TestCase): assert b.active_mappers()['dm-1'] assert b.active_mappers()['dm-1']['type'] == 'LVM' assert b.active_mappers()['dm-1']['uuid'] == 'abcdef' + + +class TestUdevData(TestCase): + def setUp(self) -> None: + udev_data_lv_device: str = """ +S:disk/by-id/dm-uuid-LVM-1f1RaxWlzQ61Sbc7oCIHRMdh0M8zRTSnU03ekuStqWuiA6eEDmwoGg3cWfFtE2li +S:mapper/vg1-lv1 +S:disk/by-id/dm-name-vg1-lv1 +S:vg1/lv1 +I:837060642207 +E:DM_UDEV_DISABLE_OTHER_RULES_FLAG= +E:DM_UDEV_DISABLE_LIBRARY_FALLBACK_FLAG=1 +E:DM_UDEV_PRIMARY_SOURCE_FLAG=1 +E:DM_UDEV_RULES_VSN=2 +E:DM_NAME=fake_vg1-fake-lv1 +E:DM_UUID=LVM-1f1RaxWlzQ61Sbc7oCIHRMdh0M8zRTSnU03ekuStqWuiA6eEDmwoGg3cWfFtE2li +E:DM_SUSPENDED=0 +E:DM_VG_NAME=fake_vg1 +E:DM_LV_NAME=fake-lv1 +E:DM_LV_LAYER= +E:NVME_HOST_IFACE=none +E:SYSTEMD_READY=1 +G:systemd +Q:systemd +V:1""" + udev_data_bare_device: str = """ +S:disk/by-path/pci-0000:00:02.0 +S:disk/by-path/virtio-pci-0000:00:02.0 +S:disk/by-diskseq/1 +I:3037919 +E:ID_PATH=pci-0000:00:02.0 +E:ID_PATH_TAG=pci-0000_00_02_0 +E:ID_PART_TABLE_UUID=baefa409 +E:ID_PART_TABLE_TYPE=dos +E:NVME_HOST_IFACE=none +G:systemd +Q:systemd +V:1""" + self.fake_device: str = '/dev/cephtest' + self.setUpPyfakefs() + self.fs.create_file(self.fake_device, st_mode=(stat.S_IFBLK | 0o600)) + self.fs.create_file('/run/udev/data/b999:0', create_missing_dirs=True, contents=udev_data_bare_device) + self.fs.create_file('/run/udev/data/b998:1', create_missing_dirs=True, contents=udev_data_lv_device) + + def test_device_not_found(self) -> None: + self.fs.remove(self.fake_device) + with pytest.raises(RuntimeError): + disk.UdevData(self.fake_device) + + @patch('ceph_volume.util.disk.os.stat', MagicMock()) + @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0)) + @patch('ceph_volume.util.disk.os.major', Mock(return_value=999)) + def test_no_data(self) -> None: + self.fs.remove('/run/udev/data/b999:0') + with pytest.raises(RuntimeError): + disk.UdevData(self.fake_device) + + @patch('ceph_volume.util.disk.os.stat', MagicMock()) + @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0)) + @patch('ceph_volume.util.disk.os.major', Mock(return_value=999)) + def test_is_dm_false(self) -> None: + assert not disk.UdevData(self.fake_device).is_dm + + @patch('ceph_volume.util.disk.os.stat', MagicMock()) + @patch('ceph_volume.util.disk.os.minor', Mock(return_value=1)) + @patch('ceph_volume.util.disk.os.major', Mock(return_value=998)) + def test_is_dm_true(self) -> None: + assert disk.UdevData(self.fake_device).is_dm + + @patch('ceph_volume.util.disk.os.stat', MagicMock()) + @patch('ceph_volume.util.disk.os.minor', Mock(return_value=1)) + @patch('ceph_volume.util.disk.os.major', Mock(return_value=998)) + def test_is_lvm_true(self) -> None: + assert disk.UdevData(self.fake_device).is_dm + + @patch('ceph_volume.util.disk.os.stat', MagicMock()) + @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0)) + @patch('ceph_volume.util.disk.os.major', Mock(return_value=999)) + def test_is_lvm_false(self) -> None: + assert not disk.UdevData(self.fake_device).is_dm + + @patch('ceph_volume.util.disk.os.stat', MagicMock()) + @patch('ceph_volume.util.disk.os.minor', Mock(return_value=1)) + @patch('ceph_volume.util.disk.os.major', Mock(return_value=998)) + def test_slashed_path_with_lvm(self) -> None: + assert disk.UdevData(self.fake_device).slashed_path == '/dev/fake_vg1/fake-lv1' + + @patch('ceph_volume.util.disk.os.stat', MagicMock()) + @patch('ceph_volume.util.disk.os.minor', Mock(return_value=1)) + @patch('ceph_volume.util.disk.os.major', 
Mock(return_value=998)) + def test_dashed_path_with_lvm(self) -> None: + assert disk.UdevData(self.fake_device).dashed_path == '/dev/mapper/fake_vg1-fake-lv1' + + @patch('ceph_volume.util.disk.os.stat', MagicMock()) + @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0)) + @patch('ceph_volume.util.disk.os.major', Mock(return_value=999)) + def test_slashed_path_with_bare_device(self) -> None: + assert disk.UdevData(self.fake_device).slashed_path == '/dev/cephtest' + + @patch('ceph_volume.util.disk.os.stat', MagicMock()) + @patch('ceph_volume.util.disk.os.minor', Mock(return_value=0)) + @patch('ceph_volume.util.disk.os.major', Mock(return_value=999)) + def test_dashed_path_with_bare_device(self) -> None: + assert disk.UdevData(self.fake_device).dashed_path == '/dev/cephtest'
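The TestUdevData cases above pin down how the new ceph_volume.util.disk.UdevData helper (added further down in util/disk.py) is expected to resolve LVM device-mapper names from the udev database at /run/udev/data/b<major>:<minor>. The following is a minimal, standalone sketch of that parsing and of the slashed/dashed path derivation, for illustration only; it is not the ceph-volume implementation, and the sample record is a trimmed version of the fixture above.

# Minimal, standalone sketch of the udev-record parsing exercised by the
# TestUdevData cases above: the record types found in
# /run/udev/data/b<major>:<minor> (S: symlink, I: id, E: key=value
# environment, G: group, Q: queue, V: version) and the slashed/dashed
# LVM path derivation. Illustration only, not the ceph-volume code.
from typing import Dict, List, Tuple


def parse_udev_record(content: str) -> Tuple[List[str], Dict[str, str]]:
    """Return (symlinks, environment) parsed from a udev data record."""
    symlinks: List[str] = []
    environment: Dict[str, str] = {}
    for line in content.strip().splitlines():
        data_type, _, data = line.partition(':')
        if data_type == 'S':
            symlinks.append(data)
        elif data_type == 'E':
            key, _, value = data.partition('=')
            environment[key] = value
        # 'I', 'G', 'Q' and 'V' records are ignored in this sketch
    return symlinks, environment


def lvm_paths(path: str, environment: Dict[str, str]) -> Dict[str, str]:
    """Derive the /dev/<vg>/<lv> and /dev/mapper/<vg>-<lv> forms."""
    # A device counts as LVM when its DM_UUID starts with 'LVM'.
    if environment.get('DM_UUID', '').startswith('LVM'):
        vg = environment.get('DM_VG_NAME', '')
        lv = environment.get('DM_LV_NAME', '')
        name = environment.get('DM_NAME', '')
        return {'slashed': f'/dev/{vg}/{lv}', 'dashed': f'/dev/mapper/{name}'}
    return {'slashed': path, 'dashed': path}


if __name__ == '__main__':
    sample = (
        "S:mapper/vg1-lv1\n"
        "E:DM_UUID=LVM-1f1RaxWlzQ61Sbc7oCIHRMdh0M8z\n"
        "E:DM_VG_NAME=fake_vg1\n"
        "E:DM_LV_NAME=fake-lv1\n"
        "E:DM_NAME=fake_vg1-fake-lv1\n"
    )
    _, env = parse_udev_record(sample)
    print(lvm_paths('/dev/dm-1', env))
    # {'slashed': '/dev/fake_vg1/fake-lv1', 'dashed': '/dev/mapper/fake_vg1-fake-lv1'}

In the shipped class the slashed form is what get_devices() now reports for LVM block devices (see the util/disk.py hunk below), while the dashed form matches the /dev/mapper name.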
\ No newline at end of file diff --git a/src/ceph-volume/ceph_volume/util/arg_validators.py b/src/ceph-volume/ceph_volume/util/arg_validators.py index 99e7d039e74..e75b34e550e 100644 --- a/src/ceph-volume/ceph_volume/util/arg_validators.py +++ b/src/ceph-volume/ceph_volume/util/arg_validators.py @@ -7,6 +7,9 @@ from ceph_volume.util import disk from ceph_volume.util.encryption import set_dmcrypt_no_workqueue +mlogger = terminal.MultiLogger(__name__) + + def valid_osd_id(val): return str(int(val)) @@ -70,6 +73,17 @@ class ValidZapDevice(ValidDevice): return self._device +class ValidClearReplaceHeaderDevice(ValidDevice): + def __call__(self, dev_path: str) -> str: + super().get_device(dev_path) + return self._format_device(self._is_valid_device()) + + def _is_valid_device(self) -> Device: + if not self._device.is_being_replaced: + mlogger.info(f'{self.dev_path} has no replacement header.') + return self._device + + class ValidDataDevice(ValidDevice): def __call__(self, dev_path): super().get_device(dev_path) diff --git a/src/ceph-volume/ceph_volume/util/device.py b/src/ceph-volume/ceph_volume/util/device.py index 9c2c11e7f31..04eefeac750 100644 --- a/src/ceph-volume/ceph_volume/util/device.py +++ b/src/ceph-volume/ceph_volume/util/device.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- - +# type: ignore import logging import os from functools import total_ordering -from ceph_volume import sys_info, allow_loop_devices +from ceph_volume import sys_info, allow_loop_devices, BEING_REPLACED_HEADER from ceph_volume.api import lvm from ceph_volume.util import disk, system from ceph_volume.util.lsmdisk import LSMDisk from ceph_volume.util.constants import ceph_disk_guids +from typing import List, Tuple logger = logging.getLogger(__name__) @@ -85,13 +86,14 @@ class Device(object): {attr:<25} {value}""" report_fields = [ - 'ceph_device', + 'ceph_device_lvm', 'rejected_reasons', 'available', 'path', 'sys_api', 'device_id', 'lsm_data', + 'being_replaced' ] pretty_report_sys_fields = [ 'actuators', @@ -135,7 +137,8 @@ class Device(object): self.blkid_api = None self._exists = None self._is_lvm_member = None - self.ceph_device = False + self.ceph_device_lvm = False + self.being_replaced: bool = self.is_being_replaced self._parse() if self.path in sys_info.devices.keys(): self.device_nodes = sys_info.devices[self.path]['device_nodes'] @@ -233,7 +236,7 @@ class Device(object): self.path = lv.lv_path self.vg_name = lv.vg_name self.lv_name = lv.name - self.ceph_device = lvm.is_ceph_device(lv) + self.ceph_device_lvm = lvm.is_ceph_device(lv) else: self.lvs = [] if self.lsblk_all: @@ -298,7 +301,7 @@ class Device(object): rot=self.rotational, available=self.available, model=self.model, - device_nodes=self.device_nodes + device_nodes=','.join(self.device_nodes) ) def json_report(self): @@ -363,7 +366,7 @@ class Device(object): self._is_lvm_member = True self.lvs.extend(lvm.get_device_lvs(path)) if self.lvs: - self.ceph_device = any([True if lv.tags.get('ceph.osd_id') else False for lv in self.lvs]) + self.ceph_device_lvm = any([True if lv.tags.get('ceph.osd_id') else False for lv in self.lvs]) def _get_partitions(self): """ @@ -590,7 +593,7 @@ class Device(object): return [vg_free] @property - def has_partitions(self): + def has_partitions(self) -> bool: ''' Boolean to determine if a given device has partitions. 
''' @@ -598,7 +601,14 @@ class Device(object): return True return False - def _check_generic_reject_reasons(self): + @property + def is_being_replaced(self) -> bool: + ''' + Boolean to indicate if the device is being replaced. + ''' + return disk._dd_read(self.path, 26) == BEING_REPLACED_HEADER + + def _check_generic_reject_reasons(self) -> List[str]: reasons = [ ('id_bus', 'usb', 'id_bus'), ('ro', '1', 'read-only'), @@ -639,9 +649,11 @@ class Device(object): rejected.append('Has partitions') if self.has_fs: rejected.append('Has a FileSystem') + if self.is_being_replaced: + rejected.append('Is being replaced') return rejected - def _check_lvm_reject_reasons(self): + def _check_lvm_reject_reasons(self) -> Tuple[bool, List[str]]: rejected = [] if self.vgs: available_vgs = [vg for vg in self.vgs if int(vg.vg_free_count) > 10] @@ -654,7 +666,7 @@ class Device(object): return len(rejected) == 0, rejected - def _check_raw_reject_reasons(self): + def _check_raw_reject_reasons(self) -> Tuple[bool, List[str]]: rejected = self._check_generic_reject_reasons() if len(self.vgs) > 0: rejected.append('LVM detected') diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py index 8f89c4a2b7c..77b55314f66 100644 --- a/src/ceph-volume/ceph_volume/util/disk.py +++ b/src/ceph-volume/ceph_volume/util/disk.py @@ -7,7 +7,7 @@ import json from ceph_volume import process, allow_loop_devices from ceph_volume.api import lvm from ceph_volume.util.system import get_file_contents -from typing import Dict, List, Any +from typing import Dict, List, Any, Union, Optional logger = logging.getLogger(__name__) @@ -251,7 +251,9 @@ def lsblk(device, columns=None, abspath=False): return result[0] -def lsblk_all(device='', columns=None, abspath=False): +def lsblk_all(device: str = '', + columns: Optional[List[str]] = None, + abspath: bool = False) -> List[Dict[str, str]]: """ Create a dictionary of identifying values for a device using ``lsblk``. Each supported column is a key, in its *raw* format (all uppercase @@ -332,7 +334,6 @@ def lsblk_all(device='', columns=None, abspath=False): if device: base_command.append('--nodeps') base_command.append(device) - out, err, rc = process.call(base_command) if rc != 0: @@ -771,9 +772,20 @@ def get_block_devs_sysfs(_sys_block_path: str = '/sys/block', _sys_dev_block_pat result.append([name, kname, "part", partitions[partition]]) return sorted(result, key=lambda x: x[0]) -def get_partitions(_sys_dev_block_path ='/sys/dev/block') -> List[str]: +def get_partitions(_sys_dev_block_path: str ='/sys/dev/block') -> Dict[str, str]: + """ + Retrieves a dictionary mapping partition system names to their parent device names. + + Args: + _sys_dev_block_path (str, optional): The path to the system's block device directory. + Defaults to '/sys/dev/block'. + + Returns: + Dict[str, str]: A dictionary where the keys are partition system names, and the values are + the corresponding parent device names. 
+ """ devices: List[str] = os.listdir(_sys_dev_block_path) - result: Dict[str, str] = dict() + result: Dict[str, str] = {} for device in devices: device_path: str = os.path.join(_sys_dev_block_path, device) is_partition: bool = int(get_file_contents(os.path.join(device_path, 'partition'), '0')) > 0 @@ -807,7 +819,7 @@ def get_devices(_sys_block_path='/sys/block', device=''): for block in block_devs: metadata: Dict[str, Any] = {} if block[2] == 'lvm': - block[1] = lvm.get_lv_path_from_mapper(block[1]) + block[1] = UdevData(block[1]).slashed_path devname = os.path.basename(block[0]) diskname = block[1] if block[2] not in block_types: @@ -846,13 +858,14 @@ def get_devices(_sys_block_path='/sys/block', device=''): device_slaves = os.listdir(os.path.join(sysdir, 'slaves')) metadata['partitions'] = get_partitions_facts(sysdir) + metadata['device_nodes'] = [] if device_slaves: - metadata['device_nodes'] = ','.join(device_slaves) + metadata['device_nodes'].extend(device_slaves) else: if block[2] == 'part': - metadata['device_nodes'] = block[3] + metadata['device_nodes'].append(block[3]) else: - metadata['device_nodes'] = devname + metadata['device_nodes'].append(devname) metadata['actuators'] = None if os.path.isdir(sysdir + "/queue/independent_access_ranges/"): @@ -968,7 +981,7 @@ def _dd_read(device: str, count: int, skip: int = 0) -> str: return result -def _dd_write(device: str, data: str, skip: int = 0) -> None: +def _dd_write(device: str, data: Union[str, bytes], skip: int = 0) -> None: """Write bytes to a device Args: @@ -980,10 +993,14 @@ def _dd_write(device: str, data: str, skip: int = 0) -> None: OSError: If there is an error opening or writing to the device. Exception: If any other error occurs during the write operation. """ + + if isinstance(data, str): + data = data.encode('utf-8') + try: with open(device, 'r+b') as b: b.seek(skip) - b.write(data.encode('utf-8')) + b.write(data) except OSError: logger.warning(f"Can't write to {device}") raise @@ -1120,10 +1137,8 @@ def get_parent_device_from_mapper(mapper: str, abspath: bool = True) -> str: pass return result - def get_lvm_mapper_path_from_dm(path: str, sys_block: str = '/sys/block') -> str: - """_summary_ - Retrieve the logical volume path for a given device. + """Retrieve the logical volume path for a given device. This function takes the path of a device and returns the corresponding logical volume path by reading the 'dm/name' file within the sysfs @@ -1134,7 +1149,7 @@ def get_lvm_mapper_path_from_dm(path: str, sys_block: str = '/sys/block') -> str sys_block (str, optional): The base sysfs block directory. Defaults to '/sys/block'. Returns: - str: The device mapper path in the form of '/dev/dm-X'. + str: The device mapper path in the 'dashed form' of '/dev/mapper/vg-lv'. """ result: str = '' dev: str = os.path.basename(path) @@ -1252,4 +1267,130 @@ class BlockSysFs: result[holder]['dmcrypt_mapping'] = content_split[3] if mapper_type == 'LVM': result[holder]['uuid'] = content_split[1] - return result
\ No newline at end of file + return result + +class UdevData: + """ + Class representing udev data for a specific device. + This class extracts and stores relevant information about the device from udev files. + + Attributes: + ----------- + path : str + The initial device path (e.g., /dev/sda). + realpath : str + The resolved real path of the device. + stats : os.stat_result + The result of the os.stat() call to retrieve device metadata. + major : int + The device's major number. + minor : int + The device's minor number. + udev_data_path : str + The path to the udev metadata for the device (e.g., /run/udev/data/b<major>:<minor>). + symlinks : List[str] + A list of symbolic links pointing to the device. + id : str + A unique identifier for the device. + environment : Dict[str, str] + A dictionary containing environment variables extracted from the udev data. + group : str + The group associated with the device. + queue : str + The queue associated with the device. + version : str + The version of the device or its metadata. + """ + def __init__(self, path: str) -> None: + """Initialize an instance of the UdevData class and load udev information. + + Args: + path (str): The path to the device to be analyzed (e.g., /dev/sda). + + Raises: + RuntimeError: Raised if no udev data file is found for the specified device. + """ + if not os.path.exists(path): + raise RuntimeError(f'{path} not found.') + self.path: str = path + self.realpath: str = os.path.realpath(self.path) + self.stats: os.stat_result = os.stat(self.realpath) + self.major: int = os.major(self.stats.st_rdev) + self.minor: int = os.minor(self.stats.st_rdev) + self.udev_data_path: str = f'/run/udev/data/b{self.major}:{self.minor}' + self.symlinks: List[str] = [] + self.id: str = '' + self.environment: Dict[str, str] = {} + self.group: str = '' + self.queue: str = '' + self.version: str = '' + + if not os.path.exists(self.udev_data_path): + raise RuntimeError(f'No udev data could be retrieved for {self.path}') + + with open(self.udev_data_path, 'r') as f: + content: str = f.read().strip() + self.raw_data: List[str] = content.split('\n') + + for line in self.raw_data: + data_type, data = line.split(':', 1) + if data_type == 'S': + self.symlinks.append(data) + if data_type == 'I': + self.id = data + if data_type == 'E': + key, value = data.split('=') + self.environment[key] = value + if data_type == 'G': + self.group = data + if data_type == 'Q': + self.queue = data + if data_type == 'V': + self.version = data + + @property + def is_dm(self) -> bool: + """Check if the device is a device mapper (DM). + + Returns: + bool: True if the device is a device mapper, otherwise False. + """ + return 'DM_UUID' in self.environment.keys() + + @property + def is_lvm(self) -> bool: + """Check if the device is a Logical Volume Manager (LVM) volume. + + Returns: + bool: True if the device is an LVM volume, otherwise False. + """ + return self.environment.get('DM_UUID', '').startswith('LVM') + + @property + def slashed_path(self) -> str: + """Get the LVM path structured with slashes. + + Returns: + str: A path using slashes if the device is an LVM volume (e.g., /dev/vgname/lvname), + otherwise the original path. + """ + result: str = self.path + if self.is_lvm: + vg: str = self.environment.get('DM_VG_NAME', '') + lv: str = self.environment.get('DM_LV_NAME', '') + result = f'/dev/{vg}/{lv}' + return result + + @property + def dashed_path(self) -> str: + """Get the LVM path structured with dashes. 
+ + Returns: + str: A path using dashes if the device is an LVM volume (e.g., /dev/mapper/vgname-lvname), + otherwise the original path. + """ + result: str = self.path + if self.is_lvm: + name: str = self.environment.get('DM_NAME', '') + result = f'/dev/mapper/{name}' + return result diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc index c0bd5b33ad4..52988843c83 100644 --- a/src/ceph_osd.cc +++ b/src/ceph_osd.cc @@ -375,8 +375,9 @@ int main(int argc, const char **argv) << " for osd." << whoami << " fsid " << g_conf().get_val<uuid_d>("fsid") << dendl; + forker.exit(0); } - if (mkfs || mkkey) { + if (mkkey) { forker.exit(0); } if (mkjournal) { diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 1ab98a0ac4f..e32e2bc49f3 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -29,6 +29,7 @@ from glob import glob from io import StringIO from threading import Thread, Event from pathlib import Path +from configparser import ConfigParser from cephadmlib.constants import ( # default images @@ -142,6 +143,7 @@ from cephadmlib.container_types import ( SidecarContainer, extract_uid_gid, is_container_running, + get_mgr_images, ) from cephadmlib.decorators import ( deprecated_command, @@ -2954,7 +2956,7 @@ def command_bootstrap(ctx): mounts = {} mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:ro' try: - out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts) + out = cli(['orch', 'apply', '--continue-on-error', '-i', '/tmp/spec.yml'], extra_mounts=mounts) logger.info(out) except Exception: ctx.error_code = -errno.EINVAL @@ -4679,6 +4681,13 @@ def command_rescan_disks(ctx: CephadmContext) -> str: return f'Ok. {len(all_scan_files)} adapters detected: {len(scan_files)} rescanned, {len(skipped)} skipped, {len(failures)} failed ({elapsed:.2f}s)' +def command_list_images(ctx: CephadmContext) -> None: + """this function will list the default images used by different services""" + cp_obj = ConfigParser() + cp_obj['mgr'] = get_mgr_images() + # print default images + cp_obj.write(sys.stdout) + ################################## @@ -5542,6 +5551,9 @@ def _get_parser(): 'disk-rescan', help='rescan all HBAs to detect new/removed devices') parser_disk_rescan.set_defaults(func=command_rescan_disks) + parser_list_images = subparsers.add_parser( + 'list-images', help='list all the default images') + parser_list_images.set_defaults(func=command_list_images) return parser diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py index d25eb1391e0..354c3782398 100644 --- a/src/cephadm/cephadmlib/constants.py +++ b/src/cephadm/cephadmlib/constants.py @@ -5,15 +5,15 @@ DEFAULT_IMAGE = 'quay.ceph.io/ceph-ci/ceph:main' DEFAULT_IMAGE_IS_MAIN = True DEFAULT_IMAGE_RELEASE = 'squid' DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0' -DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0' -DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0' +DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0' +DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0' DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8' DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3' DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4' DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17' -DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1' +DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1' 
DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23' DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29' DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29' @@ -22,7 +22,7 @@ DEFAULT_SMB_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64' DEFAULT_SMBMETRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest' DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126' DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0' -DEFAULT_REGISTRY = 'docker.io' # normalize unqualified digests to this +DEFAULT_REGISTRY = 'quay.io' # normalize unqualified digests to this # ------------------------------------------------------------------------------ LATEST_STABLE_RELEASE = 'squid' diff --git a/src/cephadm/cephadmlib/container_types.py b/src/cephadm/cephadmlib/container_types.py index 665c4d89652..791a545538a 100644 --- a/src/cephadm/cephadmlib/container_types.py +++ b/src/cephadm/cephadmlib/container_types.py @@ -8,7 +8,28 @@ import os from typing import Dict, List, Optional, Any, Union, Tuple, Iterable, cast from .call_wrappers import call, call_throws, CallVerbosity -from .constants import DEFAULT_TIMEOUT +from .constants import ( + DEFAULT_TIMEOUT, + # default container images + DEFAULT_ALERT_MANAGER_IMAGE, + DEFAULT_GRAFANA_IMAGE, + DEFAULT_LOKI_IMAGE, + DEFAULT_NODE_EXPORTER_IMAGE, + DEFAULT_PROMETHEUS_IMAGE, + DEFAULT_PROMTAIL_IMAGE, + DEFAULT_HAPROXY_IMAGE, + DEFAULT_KEEPALIVED_IMAGE, + DEFAULT_NVMEOF_IMAGE, + DEFAULT_SNMP_GATEWAY_IMAGE, + DEFAULT_ELASTICSEARCH_IMAGE, + DEFAULT_JAEGER_COLLECTOR_IMAGE, + DEFAULT_JAEGER_AGENT_IMAGE, + DEFAULT_JAEGER_QUERY_IMAGE, + DEFAULT_SMB_IMAGE, + DEFAULT_SMBMETRICS_IMAGE, + DEFAULT_NGINX_IMAGE, + DEFAULT_OAUTH2_PROXY_IMAGE, +) from .container_engines import Docker, Podman from .context import CephadmContext from .daemon_identity import DaemonIdentity, DaemonSubIdentity @@ -660,3 +681,30 @@ def enable_shared_namespaces( cc = f'container:{name}' for n in ns: _replace_container_arg(args, n.to_option(cc)) + + +def get_mgr_images() -> dict: + """Return dict of default mgr images""" + mgr_prefix = 'mgr/cephadm/container_image_' + mgr_images = {} + mgr_images[mgr_prefix + 'prometheus'] = DEFAULT_PROMETHEUS_IMAGE + mgr_images[mgr_prefix + 'alertmanager'] = DEFAULT_ALERT_MANAGER_IMAGE + mgr_images[mgr_prefix + 'graphana'] = DEFAULT_GRAFANA_IMAGE + mgr_images[mgr_prefix + 'loki'] = DEFAULT_LOKI_IMAGE + mgr_images[mgr_prefix + 'promtail'] = DEFAULT_PROMTAIL_IMAGE + mgr_images[mgr_prefix + 'node_exporter'] = DEFAULT_NODE_EXPORTER_IMAGE + mgr_images[mgr_prefix + 'haproxy'] = DEFAULT_HAPROXY_IMAGE + mgr_images[mgr_prefix + 'keepalived'] = DEFAULT_KEEPALIVED_IMAGE + mgr_images[mgr_prefix + 'nvmeof'] = DEFAULT_NVMEOF_IMAGE + mgr_images[mgr_prefix + 'snmp_gateway'] = DEFAULT_SNMP_GATEWAY_IMAGE + mgr_images[mgr_prefix + 'elasticsearch'] = DEFAULT_ELASTICSEARCH_IMAGE + mgr_images[ + mgr_prefix + 'jaeger_collector' + ] = DEFAULT_JAEGER_COLLECTOR_IMAGE + mgr_images[mgr_prefix + 'jaeger_agent'] = DEFAULT_JAEGER_AGENT_IMAGE + mgr_images[mgr_prefix + 'jaeger_query'] = DEFAULT_JAEGER_QUERY_IMAGE + mgr_images[mgr_prefix + 'smb'] = DEFAULT_SMB_IMAGE + mgr_images[mgr_prefix + 'smbmetrics'] = DEFAULT_SMBMETRICS_IMAGE + mgr_images[mgr_prefix + 'nginx'] = DEFAULT_NGINX_IMAGE + mgr_images[mgr_prefix + 'oauth2_proxy'] = DEFAULT_OAUTH2_PROXY_IMAGE + return mgr_images diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py index ae9acbc9c45..82f886e72ec 100644 --- 
a/src/cephadm/cephadmlib/daemons/smb.py +++ b/src/cephadm/cephadmlib/daemons/smb.py @@ -72,6 +72,7 @@ class Config: instance_id: str source_config: str samba_debug_level: int + ctdb_log_level: str debug_delay: int domain_member: bool clustered: bool @@ -98,6 +99,7 @@ class Config: domain_member: bool, clustered: bool, samba_debug_level: int = 0, + ctdb_log_level: str = '', debug_delay: int = 0, join_sources: Optional[List[str]] = None, user_sources: Optional[List[str]] = None, @@ -119,6 +121,7 @@ class Config: self.domain_member = domain_member self.clustered = clustered self.samba_debug_level = samba_debug_level + self.ctdb_log_level = ctdb_log_level self.debug_delay = debug_delay self.join_sources = join_sources or [] self.user_sources = user_sources or [] @@ -370,6 +373,8 @@ class CTDBDaemonContainer(SambaContainerCommon): # make conditional? # CAP_NET_ADMIN is needed for event script to add public ips to iface cargs.append('--cap-add=NET_ADMIN') + # CAP_NET_RAW allows to send gratuitous ARPs/tickle ACKs via raw sockets + cargs.append('--cap-add=NET_RAW') return cargs @@ -714,6 +719,18 @@ class SMB(ContainerDaemonForm): mounts[ctdb_run] = '/var/run/ctdb:z' mounts[ctdb_volatile] = '/var/lib/ctdb/volatile:z' mounts[ctdb_etc] = '/etc/ctdb:z' + # create a shared smb.conf file for our clustered instances. + # This is a HACK that substitutes for a bunch of architectural + # changes to sambacc *and* smbmetrics (container). In short, + # sambacc can set up the correct cluster enabled conf file for + # samba daemons (smbd, winbindd, etc) but not it's own long running + # tasks. Similarly, the smbmetrics container always uses the + # registry conf (non-clustered). Having cephadm create a stub + # config that will share the file across all containers is a + # stopgap that resolves the problem for now, but should eventually + # be replaced by a less "leaky" approach in the managed containers. + ctdb_smb_conf = str(data_dir / 'ctdb/smb.conf') + mounts[ctdb_smb_conf] = '/etc/samba/smb.conf:z' def customize_container_endpoints( self, endpoints: List[EndPoint], deployment_type: DeploymentType @@ -739,11 +756,12 @@ class SMB(ContainerDaemonForm): file_utils.makedirs(ddir / 'ctdb/volatile', uid, gid, 0o770) file_utils.makedirs(ddir / 'ctdb/etc', uid, gid, 0o770) self._write_ctdb_stub_config(etc_samba_ctr / 'ctdb.json') + self._write_smb_conf_stub(ddir / 'ctdb/smb.conf') def _write_ctdb_stub_config(self, path: pathlib.Path) -> None: reclock_cmd = ' '.join(_MUTEX_SUBCMD + [self._cfg.cluster_lock_uri]) nodes_cmd = ' '.join(_NODES_SUBCMD) - stub_config = { + stub_config: Dict[str, Any] = { 'samba-container-config': 'v0', 'ctdb': { # recovery_lock is passed directly to ctdb: needs '!' prefix @@ -755,9 +773,24 @@ class SMB(ContainerDaemonForm): ), }, } + if self._cfg.ctdb_log_level: + stub_config['ctdb']['log_level'] = self._cfg.ctdb_log_level with file_utils.write_new(path) as fh: json.dump(stub_config, fh) + def _write_smb_conf_stub(self, path: pathlib.Path) -> None: + """Initialize a stub smb conf that will be shared by the primary + and sidecar containers. This is expected to be overwritten by + sambacc. 
+ """ + _lines = [ + '[global]', + 'config backend = registry', + ] + with file_utils.write_new(path) as fh: + for line in _lines: + fh.write(f'{line}\n') + class _NetworkMapper: """Helper class that maps between cephadm-friendly address-networks diff --git a/src/cephadm/cephadmlib/data_utils.py b/src/cephadm/cephadmlib/data_utils.py index 2f4674752cc..0ab8b38d2b5 100644 --- a/src/cephadm/cephadmlib/data_utils.py +++ b/src/cephadm/cephadmlib/data_utils.py @@ -165,17 +165,17 @@ def is_fsid(s): def normalize_image_digest(digest: str) -> str: """ Normal case: - >>> normalize_image_digest('ceph/ceph', 'docker.io') - 'docker.io/ceph/ceph' + >>> normalize_image_digest('ceph/ceph', 'quay.io') + 'quay.io/ceph/ceph' No change: - >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io') + >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io') 'quay.ceph.io/ceph/ceph' - >>> normalize_image_digest('docker.io/ubuntu', 'docker.io') - 'docker.io/ubuntu' + >>> normalize_image_digest('quay.io/ubuntu', 'quay.io') + 'quay.io/ubuntu' - >>> normalize_image_digest('localhost/ceph', 'docker.io') + >>> normalize_image_digest('localhost/ceph', 'quay.io') 'localhost/ceph' """ known_shortnames = [ diff --git a/src/cephadm/samples/custom_container.json b/src/cephadm/samples/custom_container.json index 194a44d2abb..210cf1e3e55 100644 --- a/src/cephadm/samples/custom_container.json +++ b/src/cephadm/samples/custom_container.json @@ -1,5 +1,5 @@ { - "image": "docker.io/prom/alertmanager:v0.20.0", + "image": "quay.io/prometheus/alertmanager:v0.20.0", "ports": [9093, 9094], "args": [ "-p", "9093:9093", diff --git a/src/cephadm/tests/build/test_cephadm_build.py b/src/cephadm/tests/build/test_cephadm_build.py index 1465c2c5efe..c2995a76d4b 100644 --- a/src/cephadm/tests/build/test_cephadm_build.py +++ b/src/cephadm/tests/build/test_cephadm_build.py @@ -34,12 +34,12 @@ CONTAINERS = { }, 'ubuntu-20.04': { 'name': 'cephadm-build-test:ubuntu-20-04-py3', - 'base_image': 'docker.io/library/ubuntu:20.04', + 'base_image': 'quay.io/library/ubuntu:20.04', 'script': 'apt update && apt install -y python3-venv', }, 'ubuntu-22.04': { 'name': 'cephadm-build-test:ubuntu-22-04-py3', - 'base_image': 'docker.io/library/ubuntu:22.04', + 'base_image': 'quay.io/library/ubuntu:22.04', 'script': 'apt update && apt install -y python3-venv', }, } diff --git a/src/cephadm/tests/test_cephadm.py b/src/cephadm/tests/test_cephadm.py index 928982de70b..f27b9bcd362 100644 --- a/src/cephadm/tests/test_cephadm.py +++ b/src/cephadm/tests/test_cephadm.py @@ -533,12 +533,12 @@ class TestCephAdm(object): def test_get_image_info_from_inspect(self): # podman - out = """204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]""" + out = """204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1,[quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992]""" r = _cephadm.get_image_info_from_inspect(out, 'registry/ceph/ceph:latest') print(r) assert r == { 'image_id': '204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1', - 'repo_digests': ['docker.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992'] + 'repo_digests': ['quay.io/ceph/ceph@sha256:1cc9b824e1b076cdff52a9aa3f0cc8557d879fb2fbbba0cafed970aca59a3992'] } # docker @@ -550,13 +550,13 @@ class TestCephAdm(object): } # multiple digests (podman) - out = 
"""e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]""" + out = """e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42,[quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4 quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a]""" r = _cephadm.get_image_info_from_inspect(out, 'registry/prom/prometheus:latest') assert r == { 'image_id': 'e935122ab143a64d92ed1fbb27d030cf6e2f0258207be1baf1b509c466aeeb42', 'repo_digests': [ - 'docker.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4', - 'docker.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a', + 'quay.io/prom/prometheus@sha256:e4ca62c0d62f3e886e684806dfe9d4e0cda60d54986898173c1083856cfda0f4', + 'quay.io/prom/prometheus@sha256:efd99a6be65885c07c559679a0df4ec709604bcdd8cd83f0d00a1a683b28fb6a', ] } @@ -604,7 +604,7 @@ class TestCephAdm(object): '') out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC - docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' + quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' with mock.patch('cephadm.call_throws', return_value=(out, '', '')): with mock.patch('cephadm.get_container_info', return_value=cinfo): image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine) @@ -613,7 +613,7 @@ class TestCephAdm(object): # make sure first valid image is used when no container_info is found out = '''quay.ceph.io/ceph-ci/ceph@sha256:87f200536bb887b36b959e887d5984dd7a3f008a23aa1f283ab55d48b22c6185|dad864ee21e9|main|2022-03-23 16:29:19 +0000 UTC quay.ceph.io/ceph-ci/ceph@sha256:b50b130fcda2a19f8507ddde3435bb4722266956e1858ac395c838bc1dcf1c0e|514e6a882f6e|pacific|2022-03-23 15:58:34 +0000 UTC - docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' + quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' with mock.patch('cephadm.call_throws', return_value=(out, '', '')): with mock.patch('cephadm.get_container_info', return_value=None): image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine) @@ -621,12 +621,12 @@ class TestCephAdm(object): # make sure images without digest are discarded (no container_info is found) out = '''quay.ceph.io/ceph-ci/ceph@||| - docker.io/ceph/ceph@||| - docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' + quay.io/ceph/ceph@||| + quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508|666bbfa87e8d|v15.2.5|2020-09-16 14:15:15 +0000 UTC''' with mock.patch('cephadm.call_throws', return_value=(out, '', '')): with mock.patch('cephadm.get_container_info', 
return_value=None): image = _cephadm.infer_local_ceph_image(ctx, ctx.container_engine) - assert image == 'docker.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508' + assert image == 'quay.io/ceph/ceph@sha256:939a46c06b334e094901560c8346de33c00309e3e3968a2db240eb4897c6a508' @@ -2409,7 +2409,7 @@ class TestSNMPGateway: def test_unit_run_V2c(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.V2c_config) ctx.fsid = fsid @@ -2434,11 +2434,11 @@ class TestSNMPGateway: ) with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f: run_cmd = f.readlines()[-1].rstrip() - assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl') + assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V2c --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl') def test_unit_run_V3_noPriv(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.V3_no_priv_config) ctx.fsid = fsid @@ -2463,11 +2463,11 @@ class TestSNMPGateway: ) with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f: run_cmd = f.readlines()[-1].rstrip() - assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000') + assert run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9465 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000') def test_unit_run_V3_Priv(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.V3_priv_config) ctx.fsid = fsid @@ -2492,11 +2492,11 @@ class TestSNMPGateway: ) with open(f'/var/lib/ceph/{fsid}/snmp-gateway.daemon_id/unit.run', 'r') as f: run_cmd = f.readlines()[-1].rstrip() - assert run_cmd.endswith('docker.io/maxwo/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES') + assert 
run_cmd.endswith('quay.io/ceph/snmp-notifier:v1.2.1 --web.listen-address=:9464 --snmp.destination=192.168.1.10:162 --snmp.version=V3 --log.level=info --snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl --snmp.authentication-enabled --snmp.authentication-protocol=SHA --snmp.security-engine-id=8000C53F00000000 --snmp.private-enabled --snmp.private-protocol=DES') def test_unit_run_no_dest(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.no_destination_config) ctx.fsid = fsid @@ -2512,7 +2512,7 @@ class TestSNMPGateway: def test_unit_run_bad_version(self, cephadm_fs): fsid = 'ca734440-3dc6-11ec-9b98-5254002537a6' - with with_cephadm_ctx(['--image=docker.io/maxwo/snmp-notifier:v1.2.1'], list_networks={}) as ctx: + with with_cephadm_ctx(['--image=quay.io/ceph/snmp-notifier:v1.2.1'], list_networks={}) as ctx: import json ctx.config_json = json.dumps(self.bad_version_config) ctx.fsid = fsid diff --git a/src/cephadm/tests/test_custom_container.py b/src/cephadm/tests/test_custom_container.py index c185b0908df..197ed38dca3 100644 --- a/src/cephadm/tests/test_custom_container.py +++ b/src/cephadm/tests/test_custom_container.py @@ -47,7 +47,7 @@ class TestCustomContainer(unittest.TestCase): ] ] }, - image='docker.io/library/hello-world:latest' + image='quay.io/hello-world/hello-world:latest' ) def test_entrypoint(self): diff --git a/src/cephadm/tox.ini b/src/cephadm/tox.ini index 70e9a411238..20608c1681c 100644 --- a/src/cephadm/tox.ini +++ b/src/cephadm/tox.ini @@ -49,7 +49,8 @@ deps = flake8-quotes commands = flake8 --config=tox.ini {posargs:cephadm.py cephadmlib} - bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 11' + bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "docker.io" | wc -l) == 1' + bash -c 'test $(git ls-files 'cephadm.py' 'cephadmlib/*.py' | sort -u | xargs grep "quay.io" | wc -l) == 25' # Downstream distributions may choose to alter this "docker.io" number, # to make sure no new references to docker.io are creeping in unnoticed. 
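The cephadm hunks above add a `list-images` subcommand: command_list_images() collects the default container image for each service via get_mgr_images() (container_types.py) and prints them as an INI [mgr] section keyed by mgr/cephadm/container_image_*. Below is a minimal standalone sketch of that mechanism; only three of the defaults from cephadmlib/constants.py are reproduced, while the real command covers the full set.

# Minimal sketch of what the new `cephadm list-images` command does:
# build a mapping of mgr/cephadm/container_image_* keys to the default
# images and print it as an INI [mgr] section via ConfigParser. Only a
# few of the defaults from cephadmlib/constants.py are repeated here;
# the shipped get_mgr_images() returns the full set.
import sys
from configparser import ConfigParser

DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0'


def get_mgr_images() -> dict:
    """Return a (truncated) dict of default mgr images."""
    mgr_prefix = 'mgr/cephadm/container_image_'
    return {
        mgr_prefix + 'prometheus': DEFAULT_PROMETHEUS_IMAGE,
        mgr_prefix + 'alertmanager': DEFAULT_ALERT_MANAGER_IMAGE,
        mgr_prefix + 'node_exporter': DEFAULT_NODE_EXPORTER_IMAGE,
    }


def command_list_images() -> None:
    """Print the default images, mirroring cephadm's command_list_images()."""
    cp = ConfigParser()
    cp['mgr'] = get_mgr_images()
    cp.write(sys.stdout)


if __name__ == '__main__':
    command_list_images()
    # [mgr]
    # mgr/cephadm/container_image_prometheus = quay.io/prometheus/prometheus:v2.51.0
    # mgr/cephadm/container_image_alertmanager = quay.io/prometheus/alertmanager:v0.27.0
    # mgr/cephadm/container_image_node_exporter = quay.io/prometheus/node-exporter:v1.7.0

The INI form keeps the keys aligned with the mgr/cephadm/container_image_* config options, and the adjusted grep counts in the tox.ini hunk above guard the accompanying docker.io to quay.io registry migration seen throughout these cephadm changes.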
diff --git a/src/client/Client.cc b/src/client/Client.cc index e208cf76675..f687264e167 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -3646,6 +3646,9 @@ void Client::put_cap_ref(Inode *in, int cap) if (last & CEPH_CAP_FILE_CACHE) { ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl; ++put_nref; + + ldout(cct, 10) << __func__ << " calling signal_caps_inode" << dendl; + signal_caps_inode(in); } if (drop) check_caps(in, 0); @@ -6125,6 +6128,10 @@ int Client::may_open(Inode *in, int flags, const UserPerm& perms) int r = 0; switch (in->mode & S_IFMT) { case S_IFLNK: +#if defined(__linux__) && defined(O_PATH) + if (flags & O_PATH) + break; +#endif r = -CEPHFS_ELOOP; goto out; case S_IFDIR: @@ -7953,6 +7960,12 @@ int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, c return r; } + if (!strcmp(relpath, "")) { + if (!dirinode.get()->is_symlink()) + return -CEPHFS_ENOENT; + return _readlink(dirinode.get(), buf, size); + } + InodeRef in; filepath path(relpath); r = path_walk(path, &in, perms, false, 0, dirinode); @@ -10798,7 +10811,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r) goto success; } - clnt->put_cap_ref(in, CEPH_CAP_FILE_RD); // reverify size { r = clnt->_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms); @@ -10810,14 +10822,6 @@ void Client::C_Read_Sync_NonBlocking::finish(int r) if ((uint64_t)pos >= in->size) goto success; - { - int have_caps2 = 0; - r = clnt->get_caps(f, CEPH_CAP_FILE_RD, have_caps, &have_caps2, -1); - if (r < 0) { - goto error; - } - } - wanted = left; retry(); clnt->client_lock.unlock(); @@ -10971,6 +10975,20 @@ retry: // branch below but in a non-blocking fashion. The code in _read_sync // is duplicated and modified and exists in // C_Read_Sync_NonBlocking::finish(). + + // trim read based on file size? + if ((offset >= in->size) || (size == 0)) { + // read is requested at the EOF or the read len is zero, therefore just + // release managed pointers and complete the C_Read_Finisher immediately with 0 bytes + + Context *iof = iofinish.release(); + crf.release(); + iof->complete(0); + + // Signal async completion + return 0; + } + C_Read_Sync_NonBlocking *crsa = new C_Read_Sync_NonBlocking(this, iofinish.release(), f, in, f->pos, offset, size, bl, filer.get(), have); @@ -11399,10 +11417,18 @@ int64_t Client::_write_success(Fh *f, utime_t start, uint64_t fpos, return r; } +void Client::C_Lock_Client_Finisher::finish(int r) +{ + std::scoped_lock lock(clnt->client_lock); + onfinish->complete(r); +} + void Client::C_Write_Finisher::finish_io(int r) { bool fini; + ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock)); + clnt->put_cap_ref(in, CEPH_CAP_FILE_BUFFER); if (r >= 0) { @@ -11438,6 +11464,8 @@ void Client::C_Write_Finisher::finish_fsync(int r) bool fini; client_t const whoami = clnt->whoami; // For the benefit of ldout prefix + ceph_assert(ceph_mutex_is_locked_by_me(clnt->client_lock)); + ldout(clnt->cct, 3) << "finish_fsync r = " << r << dendl; fsync_finished = true; @@ -11598,6 +11626,7 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, std::unique_ptr<Context> iofinish = nullptr; std::unique_ptr<C_Write_Finisher> cwf = nullptr; + std::unique_ptr<Context> filer_iofinish = nullptr; if (in->inline_version < CEPH_INLINE_NONE) { if (endoff > cct->_conf->client_max_inline_size || @@ -11709,7 +11738,10 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, if (onfinish == nullptr) { // We need a safer condition to wait on. 
cond_iofinish = new C_SaferCond(); - iofinish.reset(cond_iofinish); + filer_iofinish.reset(cond_iofinish); + } else { + //Register a wrapper callback for the C_Write_Finisher which takes 'client_lock' + filer_iofinish.reset(new C_Lock_Client_Finisher(this, iofinish.get())); } get_cap_ref(in, CEPH_CAP_FILE_BUFFER); @@ -11717,11 +11749,12 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(), offset, size, bl, ceph::real_clock::now(), 0, in->truncate_size, in->truncate_seq, - iofinish.get()); + filer_iofinish.get()); if (onfinish) { // handle non-blocking caller (onfinish != nullptr), we can now safely // release all the managed pointers + filer_iofinish.release(); iofinish.release(); onuninline.release(); cwf.release(); diff --git a/src/client/Client.h b/src/client/Client.h index 5a1e69394d0..f8c39e2fdd6 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -1409,6 +1409,21 @@ private: void finish(int r) override; }; + // A wrapper callback which takes the 'client_lock' and finishes the context. + // One of the usecase is the filer->write_trunc which doesn't hold client_lock + // in the call back passed. So, use this wrapper in such cases. + class C_Lock_Client_Finisher : public Context { + public: + C_Lock_Client_Finisher(Client *clnt, Context *onfinish) + : clnt(clnt), onfinish(onfinish) {} + + private: + Client *clnt; + Context *onfinish; + + void finish(int r) override; + }; + class C_Write_Finisher : public Context { public: void finish_io(int r); diff --git a/src/cls/user/cls_user.cc b/src/cls/user/cls_user.cc index 0447bf33a2c..592f304fc71 100644 --- a/src/cls/user/cls_user.cc +++ b/src/cls/user/cls_user.cc @@ -482,10 +482,6 @@ static int cls_user_reset_stats2(cls_method_context_t hctx, add_header_stats(&ret.acc_stats, e); } - /* try-update marker */ - if(!keys.empty()) - ret.marker = (--keys.cend())->first; - if (! 
ret.truncated) { buffer::list bl; header.last_stats_update = op.time; @@ -500,6 +496,10 @@ static int cls_user_reset_stats2(cls_method_context_t hctx, return rc; } + /* try-update marker */ + if(!keys.empty()) + ret.marker = (--keys.cend())->first; + /* return partial result */ encode(ret, *out); return 0; diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 8b9f3339e38..ea3cce16609 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -12,6 +12,7 @@ if(WIN32) add_library(dlfcn_win32 STATIC win32/dlfcn.cc win32/errno.cc) endif() +add_subdirectory(io_exerciser) add_subdirectory(options) set(common_srcs diff --git a/src/common/Finisher.cc b/src/common/Finisher.cc index ff931faffc1..43550f35197 100644 --- a/src/common/Finisher.cc +++ b/src/common/Finisher.cc @@ -2,11 +2,40 @@ // vim: ts=8 sw=2 smarttab #include "Finisher.h" +#include "common/perf_counters.h" + +#include <fmt/core.h> #define dout_subsys ceph_subsys_finisher #undef dout_prefix #define dout_prefix *_dout << "finisher(" << this << ") " +Finisher::Finisher(CephContext *cct_) : + cct(cct_), finisher_lock(ceph::make_mutex("Finisher::finisher_lock")), + thread_name("fn_anonymous"), + finisher_thread(this) {} + +Finisher::Finisher(CephContext *cct_, std::string_view name, std::string &&tn) : + cct(cct_), finisher_lock(ceph::make_mutex(fmt::format("Finisher::{}", name))), + thread_name(std::move(tn)), + finisher_thread(this) { + PerfCountersBuilder b(cct, fmt::format("finisher-{}", name), + l_finisher_first, l_finisher_last); + b.add_u64(l_finisher_queue_len, "queue_len"); + b.add_time_avg(l_finisher_complete_lat, "complete_latency"); + logger = b.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); + logger->set(l_finisher_queue_len, 0); + logger->set(l_finisher_complete_lat, 0); +} + +Finisher::~Finisher() { + if (logger && cct) { + cct->get_perfcounters_collection()->remove(logger); + delete logger; + } +} + void Finisher::start() { ldout(cct, 10) << __func__ << dendl; @@ -20,7 +49,7 @@ void Finisher::stop() finisher_stop = true; // we don't have any new work to do, but we want the worker to wake up anyway // to process the stop condition. - finisher_cond.notify_all(); + finisher_cond.notify_one(); finisher_lock.unlock(); finisher_thread.join(); // wait until the worker exits completely ldout(cct, 10) << __func__ << " finish" << dendl; @@ -40,7 +69,7 @@ void Finisher::wait_for_empty() bool Finisher::is_empty() { - std::unique_lock ul(finisher_lock); + const std::lock_guard l{finisher_lock}; return finisher_queue.empty(); } diff --git a/src/common/Finisher.h b/src/common/Finisher.h index 9091d0b892a..acee6594ca4 100644 --- a/src/common/Finisher.h +++ b/src/common/Finisher.h @@ -19,10 +19,8 @@ #include "include/common_fwd.h" #include "common/Thread.h" #include "common/ceph_mutex.h" -#include "common/perf_counters.h" #include "common/Cond.h" - /// Finisher queue length performance counter ID. enum { l_finisher_first = 997082, @@ -37,23 +35,23 @@ enum { * contexts to complete is thread-safe. */ class Finisher { - CephContext *cct; + CephContext *const cct; ceph::mutex finisher_lock; ///< Protects access to queues and finisher_running. ceph::condition_variable finisher_cond; ///< Signaled when there is something to process. ceph::condition_variable finisher_empty_cond; ///< Signaled when the finisher has nothing more to process. - bool finisher_stop; ///< Set when the finisher should stop. - bool finisher_running; ///< True when the finisher is currently executing contexts. 
- bool finisher_empty_wait; ///< True mean someone wait finisher empty. + bool finisher_stop = false; ///< Set when the finisher should stop. + bool finisher_running = false; ///< True when the finisher is currently executing contexts. + bool finisher_empty_wait = false; ///< True mean someone wait finisher empty. /// Queue for contexts for which complete(0) will be called. std::vector<std::pair<Context*,int>> finisher_queue; std::vector<std::pair<Context*,int>> in_progress_queue; - std::string thread_name; + const std::string thread_name; /// Performance counter for the finisher's queue length. /// Only active for named finishers. - PerfCounters *logger; + PerfCounters *logger = nullptr; void *finisher_thread_entry(); @@ -66,56 +64,34 @@ class Finisher { public: /// Add a context to complete, optionally specifying a parameter for the complete function. void queue(Context *c, int r = 0) { - std::unique_lock ul(finisher_lock); - bool was_empty = finisher_queue.empty(); - finisher_queue.push_back(std::make_pair(c, r)); - if (was_empty) { - finisher_cond.notify_one(); + { + const std::lock_guard l{finisher_lock}; + const bool should_notify = finisher_queue.empty() && !finisher_running; + finisher_queue.push_back(std::make_pair(c, r)); + if (should_notify) { + finisher_cond.notify_one(); + } } + if (logger) logger->inc(l_finisher_queue_len); } - void queue(std::list<Context*>& ls) { + // TODO use C++20 concept checks instead of SFINAE + template<typename T> + auto queue(T &ls) -> decltype(std::distance(ls.begin(), ls.end()), void()) { { - std::unique_lock ul(finisher_lock); - if (finisher_queue.empty()) { - finisher_cond.notify_all(); - } - for (auto i : ls) { - finisher_queue.push_back(std::make_pair(i, 0)); - } - if (logger) - logger->inc(l_finisher_queue_len, ls.size()); - } - ls.clear(); - } - void queue(std::deque<Context*>& ls) { - { - std::unique_lock ul(finisher_lock); - if (finisher_queue.empty()) { - finisher_cond.notify_all(); - } - for (auto i : ls) { + const std::lock_guard l{finisher_lock}; + const bool should_notify = finisher_queue.empty() && !finisher_running; + for (Context *i : ls) { finisher_queue.push_back(std::make_pair(i, 0)); } - if (logger) - logger->inc(l_finisher_queue_len, ls.size()); - } - ls.clear(); - } - void queue(std::vector<Context*>& ls) { - { - std::unique_lock ul(finisher_lock); - if (finisher_queue.empty()) { - finisher_cond.notify_all(); + if (should_notify) { + finisher_cond.notify_one(); } - for (auto i : ls) { - finisher_queue.push_back(std::make_pair(i, 0)); - } - if (logger) - logger->inc(l_finisher_queue_len, ls.size()); } + if (logger) + logger->inc(l_finisher_queue_len, ls.size()); ls.clear(); } @@ -137,36 +113,17 @@ class Finisher { bool is_empty(); + std::string_view get_thread_name() const noexcept { + return thread_name; + } + /// Construct an anonymous Finisher. /// Anonymous finishers do not log their queue length. - explicit Finisher(CephContext *cct_) : - cct(cct_), finisher_lock(ceph::make_mutex("Finisher::finisher_lock")), - finisher_stop(false), finisher_running(false), finisher_empty_wait(false), - thread_name("fn_anonymous"), logger(0), - finisher_thread(this) {} + explicit Finisher(CephContext *cct_); /// Construct a named Finisher that logs its queue length. 
- Finisher(CephContext *cct_, std::string name, std::string tn) : - cct(cct_), finisher_lock(ceph::make_mutex("Finisher::" + name)), - finisher_stop(false), finisher_running(false), finisher_empty_wait(false), - thread_name(tn), logger(0), - finisher_thread(this) { - PerfCountersBuilder b(cct, std::string("finisher-") + name, - l_finisher_first, l_finisher_last); - b.add_u64(l_finisher_queue_len, "queue_len"); - b.add_time_avg(l_finisher_complete_lat, "complete_latency"); - logger = b.create_perf_counters(); - cct->get_perfcounters_collection()->add(logger); - logger->set(l_finisher_queue_len, 0); - logger->set(l_finisher_complete_lat, 0); - } - - ~Finisher() { - if (logger && cct) { - cct->get_perfcounters_collection()->remove(logger); - delete logger; - } - } + Finisher(CephContext *cct_, std::string_view name, std::string &&tn); + ~Finisher(); }; /// Context that is completed asynchronously on the supplied finisher. diff --git a/src/common/Formatter.cc b/src/common/Formatter.cc index f68f87f4645..fd3b2be0221 100644 --- a/src/common/Formatter.cc +++ b/src/common/Formatter.cc @@ -296,6 +296,17 @@ void JSONFormatter::finish_pending_string() } } +void JSONFormatter::add_value(std::string_view name, double val) { + CachedStackStringStream css; + if (!std::isfinite(val) || std::isnan(val)) { + *css << "null"; + } else { + css->precision(std::numeric_limits<double>::max_digits10); + *css << val; + } + add_value(name, css->strv(), false); +} + template <class T> void JSONFormatter::add_value(std::string_view name, T val) { diff --git a/src/common/Formatter.h b/src/common/Formatter.h index 5575c931adc..c237e8ea207 100644 --- a/src/common/Formatter.h +++ b/src/common/Formatter.h @@ -232,6 +232,7 @@ private: void print_quoted_string(std::string_view s); void print_name(std::string_view name); void print_comma(json_formatter_stack_entry_d& entry); + void add_value(std::string_view name, double val); template <class T> void add_value(std::string_view name, T val); diff --git a/src/common/HeartbeatMap.cc b/src/common/HeartbeatMap.cc index 54442709229..246cec9460b 100644 --- a/src/common/HeartbeatMap.cc +++ b/src/common/HeartbeatMap.cc @@ -43,11 +43,11 @@ HeartbeatMap::~HeartbeatMap() ceph_assert(m_workers.empty()); } -heartbeat_handle_d *HeartbeatMap::add_worker(const string& name, pthread_t thread_id) +heartbeat_handle_d *HeartbeatMap::add_worker(string&& name, pthread_t thread_id) { std::unique_lock locker{m_rwlock}; ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl; - heartbeat_handle_d *h = new heartbeat_handle_d(name); + heartbeat_handle_d *h = new heartbeat_handle_d(std::move(name)); ANNOTATE_BENIGN_RACE_SIZED(&h->timeout, sizeof(h->timeout), "heartbeat_handle_d timeout"); ANNOTATE_BENIGN_RACE_SIZED(&h->suicide_timeout, sizeof(h->suicide_timeout), diff --git a/src/common/HeartbeatMap.h b/src/common/HeartbeatMap.h index 6f486b21ca8..401042cc271 100644 --- a/src/common/HeartbeatMap.h +++ b/src/common/HeartbeatMap.h @@ -48,15 +48,15 @@ struct heartbeat_handle_d { ceph::timespan suicide_grace = ceph::timespan::zero(); std::list<heartbeat_handle_d*>::iterator list_item; - explicit heartbeat_handle_d(const std::string& n) - : name(n) + explicit heartbeat_handle_d(std::string&& n) + : name(std::move(n)) { } }; class HeartbeatMap { public: // register/unregister - heartbeat_handle_d *add_worker(const std::string& name, pthread_t thread_id); + heartbeat_handle_d *add_worker(std::string&& name, pthread_t thread_id); void remove_worker(const heartbeat_handle_d *h); // reset the timeout so that it 
expects another touch within grace amount of time diff --git a/src/common/LRUSet.h b/src/common/LRUSet.h index b62956ba460..c8c66e85458 100644 --- a/src/common/LRUSet.h +++ b/src/common/LRUSet.h @@ -43,6 +43,7 @@ class LRUSet { // lru boost::intrusive::list< Node, + boost::intrusive::constant_time_size<false>, boost::intrusive::member_hook<Node, boost::intrusive::list_member_hook<>, &Node::lru_item> diff --git a/src/common/Preforker.h b/src/common/Preforker.h index d34179b4020..d25d5dd5ada 100644 --- a/src/common/Preforker.h +++ b/src/common/Preforker.h @@ -126,7 +126,7 @@ public: } return r; } - void exit(int r) { + [[noreturn]] void exit(int r) { if (is_child()) signal_exit(r); ::exit(r); diff --git a/src/common/SloppyCRCMap.cc b/src/common/SloppyCRCMap.cc index ec9cbdf53a6..f82a70701d2 100644 --- a/src/common/SloppyCRCMap.cc +++ b/src/common/SloppyCRCMap.cc @@ -73,7 +73,7 @@ void SloppyCRCMap::truncate(uint64_t offset) offset -= offset % block_size; std::map<uint64_t,uint32_t>::iterator p = crc_map.lower_bound(offset); while (p != crc_map.end()) - crc_map.erase(p++); + p = crc_map.erase(p); } void SloppyCRCMap::zero(uint64_t offset, uint64_t len) diff --git a/src/common/Thread.cc b/src/common/Thread.cc index 9a7a31923c1..3903e8c0ed7 100644 --- a/src/common/Thread.cc +++ b/src/common/Thread.cc @@ -83,7 +83,7 @@ void *Thread::entry_wrapper() if (pid && cpuid >= 0) _set_affinity(cpuid); - ceph_pthread_setname(pthread_self(), thread_name.c_str()); + ceph_pthread_setname(pthread_self(), Thread::thread_name.c_str()); return entry(); } @@ -154,7 +154,7 @@ int Thread::try_create(size_t stacksize) void Thread::create(const char *name, size_t stacksize) { ceph_assert(strlen(name) < 16); - thread_name = name; + Thread::thread_name = name; int ret = try_create(stacksize); if (ret != 0) { diff --git a/src/common/Thread.h b/src/common/Thread.h index 5242fb5f307..d3892c1b36b 100644 --- a/src/common/Thread.h +++ b/src/common/Thread.h @@ -20,11 +20,14 @@ #include <string_view> #include <system_error> #include <thread> +#include <cstring> #include <pthread.h> #include <sys/types.h> +#include "include/ceph_assert.h" #include "include/compat.h" +#include "include/spinlock.h" extern pid_t ceph_gettid(); @@ -33,7 +36,7 @@ class Thread { pthread_t thread_id; pid_t pid; int cpuid; - std::string thread_name; + static inline thread_local std::string thread_name; void *entry_wrapper(); @@ -61,6 +64,9 @@ class Thread { int join(void **prval = 0); int detach(); int set_affinity(int cpuid); + static const std::string get_thread_name() { + return Thread::thread_name; + } }; // Functions for with std::thread diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc index a6467bcaaca..b888d933480 100644 --- a/src/common/TrackedOp.cc +++ b/src/common/TrackedOp.cc @@ -204,7 +204,7 @@ void OpHistory::dump_slow_ops(utime_t now, Formatter *f, set<string> filters) cleanup(now); f->open_object_section("OpHistory slow ops"); f->dump_int("num to keep", history_slow_op_size.load()); - f->dump_int("threshold to keep", history_slow_op_threshold.load()); + f->dump_float("threshold to keep", history_slow_op_threshold.load()); { f->open_array_section("Ops"); for ([[maybe_unused]] const auto& [t, op] : slow_op) { diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h index 681301252de..57d73038364 100644 --- a/src/common/TrackedOp.h +++ b/src/common/TrackedOp.h @@ -68,7 +68,7 @@ class OpHistory { std::atomic_size_t history_size{0}; std::atomic_uint32_t history_duration{0}; std::atomic_size_t history_slow_op_size{0}; - 
std::atomic_uint32_t history_slow_op_threshold{0}; + std::atomic<float> history_slow_op_threshold{0}; std::atomic_bool shutdown{false}; OpHistoryServiceThread opsvc; friend class OpHistoryServiceThread; @@ -113,7 +113,7 @@ public: history_size = new_size; history_duration = new_duration; } - void set_slow_op_size_and_threshold(size_t new_size, uint32_t new_threshold) { + void set_slow_op_size_and_threshold(size_t new_size, float new_threshold) { history_slow_op_size = new_size; history_slow_op_threshold = new_threshold; } @@ -144,7 +144,7 @@ public: void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) { history.set_size_and_duration(new_size, new_duration); } - void set_history_slow_op_size_and_threshold(uint32_t new_size, uint32_t new_threshold) { + void set_history_slow_op_size_and_threshold(uint32_t new_size, float new_threshold) { history.set_slow_op_size_and_threshold(new_size, new_threshold); } bool is_tracking() const { @@ -243,6 +243,7 @@ private: public: typedef boost::intrusive::list< TrackedOp, + boost::intrusive::constant_time_size<false>, boost::intrusive::member_hook< TrackedOp, boost::intrusive::list_member_hook<>, diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h index f1877647877..6a02d5c5bf1 100644 --- a/src/common/ceph_context.h +++ b/src/common/ceph_context.h @@ -282,10 +282,20 @@ public: void set_mon_addrs(const MonMap& mm); void set_mon_addrs(const std::vector<entity_addrvec_t>& in) { auto ptr = std::make_shared<std::vector<entity_addrvec_t>>(in); +#if defined(__GNUC__) && __GNUC__ < 12 + // workaround for GCC 11 bug atomic_store_explicit(&_mon_addrs, std::move(ptr), std::memory_order_relaxed); +#else + _mon_addrs.store(std::move(ptr), std::memory_order_relaxed); +#endif } std::shared_ptr<std::vector<entity_addrvec_t>> get_mon_addrs() const { +#if defined(__GNUC__) && __GNUC__ < 12 + // workaround for GCC 11 bug auto ptr = atomic_load_explicit(&_mon_addrs, std::memory_order_relaxed); +#else + auto ptr = _mon_addrs.load(std::memory_order_relaxed); +#endif return ptr; } @@ -306,7 +316,12 @@ private: int _crypto_inited; +#if defined(__GNUC__) && __GNUC__ < 12 + // workaround for GCC 11 bug std::shared_ptr<std::vector<entity_addrvec_t>> _mon_addrs; +#else + std::atomic<std::shared_ptr<std::vector<entity_addrvec_t>>> _mon_addrs; +#endif /* libcommon service thread. * SIGHUP wakes this thread, which then reopens logfiles */ diff --git a/src/common/ceph_mutex.h b/src/common/ceph_mutex.h index 059d81f2ac3..6ed8c56d5da 100644 --- a/src/common/ceph_mutex.h +++ b/src/common/ceph_mutex.h @@ -83,7 +83,6 @@ namespace ceph { return {}; } - static constexpr bool mutex_debugging = false; #define ceph_mutex_is_locked(m) true #define ceph_mutex_is_locked_by_me(m) true } @@ -131,8 +130,6 @@ namespace ceph { return {std::forward<Args>(args)...}; } - static constexpr bool mutex_debugging = true; - // debug methods #define ceph_mutex_is_locked(m) ((m).is_locked()) #define ceph_mutex_is_not_locked(m) (!(m).is_locked()) @@ -186,8 +183,6 @@ namespace ceph { return {}; } - static constexpr bool mutex_debugging = false; - // debug methods. Note that these can blindly return true // because any code that does anything other than assert these // are true is broken. 
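The ceph_context.h hunk above keeps the free-function atomic_load_explicit/atomic_store_explicit path for GCC releases older than 12 and otherwise holds the monitor address list in a std::atomic<std::shared_ptr<...>>. Below is a stand-alone sketch of that dual-path pattern; Ctx, AddrVec, set() and get() are illustrative names rather than Ceph code, and the relaxed memory order mirrors the hunk.

#include <atomic>
#include <memory>
#include <vector>

// Illustrative stand-in for the monitor address vector held by the context.
using AddrVec = std::vector<int>;

class Ctx {
#if defined(__GNUC__) && __GNUC__ < 12
  // Older toolchains: plain shared_ptr plus the free-function atomic overloads.
  std::shared_ptr<AddrVec> mon_addrs;
#else
  // Newer toolchains: the C++20 std::atomic specialization for shared_ptr.
  std::atomic<std::shared_ptr<AddrVec>> mon_addrs;
#endif

public:
  void set(std::shared_ptr<AddrVec> p) {
#if defined(__GNUC__) && __GNUC__ < 12
    std::atomic_store_explicit(&mon_addrs, std::move(p), std::memory_order_relaxed);
#else
    mon_addrs.store(std::move(p), std::memory_order_relaxed);
#endif
  }

  std::shared_ptr<AddrVec> get() const {
#if defined(__GNUC__) && __GNUC__ < 12
    return std::atomic_load_explicit(&mon_addrs, std::memory_order_relaxed);
#else
    return mon_addrs.load(std::memory_order_relaxed);
#endif
  }
};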
diff --git a/src/common/cohort_lru.h b/src/common/cohort_lru.h index af2baaa5c67..86ced8d183c 100644 --- a/src/common/cohort_lru.h +++ b/src/common/cohort_lru.h @@ -15,6 +15,12 @@ #include <boost/intrusive/list.hpp> #include <boost/intrusive/slist.hpp> +#include <cstdint> +#include <atomic> +#include <mutex> +#include <algorithm> +#include <functional> +#include <vector> #ifdef __CEPH__ # include "include/ceph_assert.h" diff --git a/src/common/config.cc b/src/common/config.cc index e151e94bb90..3a5ee91c347 100644 --- a/src/common/config.cc +++ b/src/common/config.cc @@ -24,6 +24,8 @@ #include "common/hostname.h" #include "common/dout.h" +#include <fmt/core.h> + /* Don't use standard Ceph logging in this file. * We can't use logging until it's initialized, and a lot of the necessary * initialization happens here. @@ -131,14 +133,11 @@ md_config_t::md_config_t(ConfigValues& values, // Define the debug_* options as well. subsys_options.reserve(values.subsys.get_num()); for (unsigned i = 0; i < values.subsys.get_num(); ++i) { - string name = string("debug_") + values.subsys.get_name(i); - subsys_options.push_back( - Option(name, Option::TYPE_STR, Option::LEVEL_ADVANCED)); + subsys_options.emplace_back( + fmt::format("debug_{}", values.subsys.get_name(i)), Option::TYPE_STR, Option::LEVEL_ADVANCED); Option& opt = subsys_options.back(); - opt.set_default(stringify(values.subsys.get_log_level(i)) + "/" + - stringify(values.subsys.get_gather_level(i))); - string desc = string("Debug level for ") + values.subsys.get_name(i); - opt.set_description(desc.c_str()); + opt.set_default(fmt::format("{}/{}", values.subsys.get_log_level(i), values.subsys.get_gather_level(i))); + opt.set_description(fmt::format("Debug level for {}", values.subsys.get_name(i)).c_str()); opt.set_flag(Option::FLAG_RUNTIME); opt.set_long_description("The value takes the form 'N' or 'N/M' where N and M are values between 0 and 99. N is the debug level to log (all values below this are included), and M is the level to gather and buffer in memory. In the event of a crash, the most recent items <= M are dumped to the log file."); opt.set_subsys(i); @@ -158,7 +157,7 @@ md_config_t::md_config_t(ConfigValues& values, } else { // normalize to M/N n = m; - *value = stringify(m) + "/" + stringify(n); + *value = fmt::format("{}/{}", m, n); } } else { *error_message = "value must take the form N or N/M, where N and M are integers"; @@ -775,7 +774,7 @@ int md_config_t::parse_option(ConfigValues& values, option_name = opt.name; if (ceph_argparse_witharg( args, i, &val, err, - string(string("--default-") + opt.name).c_str(), (char*)NULL)) { + fmt::format("--default-{}", opt.name).c_str(), (char*)NULL)) { if (!err.str().empty()) { error_message = err.str(); ret = -EINVAL; @@ -1268,7 +1267,7 @@ Option::value_t md_config_t::_expand_meta( << Option::to_str(*i->second) << "\n"; } } - return Option::value_t(std::string("$") + o->name); + return Option::value_t(fmt::format("${}", o->name)); } else { // recursively evaluate! 
string n; diff --git a/src/common/config_obs_mgr.h b/src/common/config_obs_mgr.h index 759930df92d..5336538e438 100644 --- a/src/common/config_obs_mgr.h +++ b/src/common/config_obs_mgr.h @@ -75,7 +75,7 @@ typename ObserverMgr<ConfigObs>::config_obs_wptr ObserverMgr<ConfigObs>::remove_ for (auto o = observers.begin(); o != observers.end(); ) { if (*o->second == observer) { ptr = std::move(o->second); - observers.erase(o++); + o = observers.erase(o); found_obs = true; } else { ++o; diff --git a/src/common/config_proxy.h b/src/common/config_proxy.h index b9b47d9cef4..12a273b8c84 100644 --- a/src/common/config_proxy.h +++ b/src/common/config_proxy.h @@ -31,7 +31,6 @@ class ConfigProxy { using rev_obs_map_t = ObsMgr::rev_obs_map; void _call_observers(rev_obs_map_t& rev_obs) { - ceph_assert(!ceph::mutex_debugging || !ceph_mutex_is_locked_by_me(lock)); for (auto& [obs, keys] : rev_obs) { (*obs)->handle_conf_change(*this, keys); } diff --git a/src/common/intrusive_lru.h b/src/common/intrusive_lru.h index 564cceef1cc..3ed3625d8a0 100644 --- a/src/common/intrusive_lru.h +++ b/src/common/intrusive_lru.h @@ -125,6 +125,7 @@ class intrusive_lru { using lru_list_t = boost::intrusive::list< base_t, + boost::intrusive::constant_time_size<false>, boost::intrusive::member_hook< base_t, boost::intrusive::list_member_hook<>, diff --git a/src/common/io_exerciser/CMakeLists.txt b/src/common/io_exerciser/CMakeLists.txt new file mode 100644 index 00000000000..07091df86e1 --- /dev/null +++ b/src/common/io_exerciser/CMakeLists.txt @@ -0,0 +1,13 @@ +add_library(object_io_exerciser STATIC + DataGenerator.cc + IoOp.cc + IoSequence.cc + Model.cc + ObjectModel.cc + RadosIo.cc +) + +target_link_libraries(object_io_exerciser + librados + global +)
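Several hunks in this patch (LRUSet.h, TrackedOp.h and intrusive_lru.h) add boost::intrusive::constant_time_size<false> to intrusive list declarations. The stand-alone sketch below, with illustrative Node/List names, shows what that option trades: the list stops maintaining an element counter, so insert and erase never touch a shared count, while size() becomes O(n).

#include <boost/intrusive/list.hpp>
#include <cassert>

struct Node {
  int value = 0;
  boost::intrusive::list_member_hook<> hook;  // embedded link, no allocation per insert
};

// With constant_time_size<false> the list keeps no element counter.
using List = boost::intrusive::list<
  Node,
  boost::intrusive::constant_time_size<false>,
  boost::intrusive::member_hook<Node, boost::intrusive::list_member_hook<>, &Node::hook>>;

int main() {
  Node a, b;
  List l;
  l.push_back(a);
  l.push_back(b);
  assert(!l.empty());     // O(1) either way
  assert(l.size() == 2);  // O(n) when constant_time_size<false>
  l.clear();              // unlinks only; nodes are externally owned
}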
\ No newline at end of file diff --git a/src/common/io_exerciser/DataGenerator.cc b/src/common/io_exerciser/DataGenerator.cc new file mode 100644 index 00000000000..9aa77eeb6e9 --- /dev/null +++ b/src/common/io_exerciser/DataGenerator.cc @@ -0,0 +1,753 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "DataGenerator.h" + +#include "ObjectModel.h" + +#include "common/debug.h" +#include "common/dout.h" + +#include "fmt/format.h" +#include "fmt/ranges.h" + +#include <chrono> +#include <iostream> +#include <stdexcept> + +#define dout_subsys ceph_subsys_rados +#define dout_context g_ceph_context + +using DataGenerator = ceph::io_exerciser::data_generation::DataGenerator; +using SeededRandomGenerator = ceph::io_exerciser::data_generation + ::SeededRandomGenerator; +using HeaderedSeededRandomGenerator = ceph::io_exerciser::data_generation + ::HeaderedSeededRandomGenerator; + +std::unique_ptr<DataGenerator> DataGenerator::create_generator( + GenerationType generationType, const ObjectModel& model) +{ + switch(generationType) + { + case GenerationType::SeededRandom: + return std::make_unique<SeededRandomGenerator>(model); + case GenerationType::HeaderedSeededRandom: + return std::make_unique<HeaderedSeededRandomGenerator>(model); + default: + throw std::invalid_argument("Not yet implemented"); + } + + return nullptr; +} + +bufferlist DataGenerator::generate_wrong_data(uint64_t offset, uint64_t length) +{ + bufferlist retlist; + uint64_t block_size = m_model.get_block_size(); + char buffer[block_size]; + for (uint64_t block_offset = offset; + block_offset < offset + length; + block_offset++) + { + std::memset(buffer, 0, block_size); + retlist.append(ceph::bufferptr(buffer, block_size)); + } + return retlist; +} + +bool DataGenerator::validate(bufferlist& bufferlist, uint64_t offset, uint64_t length) +{ + return bufferlist.contents_equal(generate_data(offset, length)); +} + +ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset) +{ + uint64_t block_size = m_model.get_block_size(); + char buffer[block_size]; + + std::mt19937_64 random_generator(m_model.get_seed(block_offset)); + uint64_t rand1 = random_generator(); + uint64_t rand2 = random_generator(); + + constexpr size_t generation_length = sizeof(uint64_t); + + for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--) + { + std::memcpy(buffer + i, &rand1, generation_length); + std::memcpy(buffer + i + generation_length, &rand2, generation_length); + } + + size_t remainingBytes = block_size % (generation_length * 2); + if (remainingBytes > generation_length) + { + size_t remainingBytes2 = remainingBytes - generation_length; + std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); + std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2); + } + else if (remainingBytes > 0) + { + std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); + } + + return ceph::bufferptr(buffer, block_size); +} + +ceph::bufferptr SeededRandomGenerator::generate_wrong_block(uint64_t block_offset) +{ + uint64_t block_size = m_model.get_block_size(); + char buffer[block_size]; + + std::mt19937_64 random_generator(m_model.get_seed(block_offset)); + uint64_t rand1 = random_generator() - 1; + uint64_t rand2 = random_generator() + 1; + + constexpr size_t generation_length = sizeof(uint64_t); + + for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--) + { + std::memcpy(buffer + i, &rand1, 
generation_length);
+    std::memcpy(buffer + i + generation_length, &rand2, generation_length);
+  }
+
+  size_t remainingBytes = block_size % (generation_length * 2);
+  if (remainingBytes > generation_length)
+  {
+    size_t remainingBytes2 = remainingBytes - generation_length;
+    std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
+    std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2);
+  }
+  else if (remainingBytes > 0)
+  {
+    std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
+  }
+
+  return ceph::bufferptr(buffer, block_size);
+}
+
+bufferlist SeededRandomGenerator::generate_data(uint64_t offset, uint64_t length)
+{
+  bufferlist retlist;
+
+  for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++)
+  {
+    retlist.append(generate_block(block_offset));
+  }
+
+  return retlist;
+}
+
+bufferlist SeededRandomGenerator::generate_wrong_data(uint64_t offset, uint64_t length)
+{
+  bufferlist retlist;
+
+  for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++)
+  {
+    retlist.append(generate_wrong_block(block_offset));
+  }
+
+  return retlist;
+}
+
+HeaderedSeededRandomGenerator
+  ::HeaderedSeededRandomGenerator(const ObjectModel& model,
+                                  std::optional<uint64_t> unique_run_id) :
+  SeededRandomGenerator(model),
+  unique_run_id(unique_run_id.value_or(generate_unique_run_id()))
+{
+
+}
+
+uint64_t HeaderedSeededRandomGenerator::generate_unique_run_id()
+{
+  std::mt19937_64 random_generator =
+    std::mt19937_64(duration_cast<std::chrono::milliseconds>(
+      std::chrono::system_clock::now().time_since_epoch()).count());
+
+  return random_generator();
+}
+
+ceph::bufferptr HeaderedSeededRandomGenerator::generate_block(uint64_t block_offset)
+{
+  SeedBytes seed = m_model.get_seed(block_offset);
+  TimeBytes current_time = duration_cast<std::chrono::milliseconds>(
+    std::chrono::system_clock::now().time_since_epoch()).count();
+
+  ceph::bufferptr bufferptr = SeededRandomGenerator::generate_block(block_offset);
+
+  std::memcpy(bufferptr.c_str() + uniqueIdStart(), &unique_run_id, uniqueIdLength());
+  std::memcpy(bufferptr.c_str() + seedStart(), &seed, seedLength());
+  std::memcpy(bufferptr.c_str() + timeStart(), &current_time, timeLength());
+
+  return bufferptr;
+}
+
+ceph::bufferptr HeaderedSeededRandomGenerator::generate_wrong_block(uint64_t block_offset)
+{
+  return HeaderedSeededRandomGenerator::generate_block(block_offset % 8);
+}
+
+const HeaderedSeededRandomGenerator::UniqueIdBytes
+  HeaderedSeededRandomGenerator::readUniqueRunId(uint64_t block_offset,
+                                                 const bufferlist& bufferlist)
+{
+  UniqueIdBytes read_unique_run_id = 0;
+  std::memcpy(&read_unique_run_id,
+              &bufferlist[(block_offset * m_model.get_block_size()) + uniqueIdStart()],
+              uniqueIdLength());
+  return read_unique_run_id;
+}
+
+const HeaderedSeededRandomGenerator::SeedBytes
+  HeaderedSeededRandomGenerator::readSeed(uint64_t block_offset,
+                                          const bufferlist& bufferlist)
+{
+  SeedBytes read_seed = 0;
+  std::memcpy(&read_seed,
+              &bufferlist[(block_offset * m_model.get_block_size()) + seedStart()],
+              seedLength());
+  return read_seed;
+}
+
+const HeaderedSeededRandomGenerator::TimeBytes
+  HeaderedSeededRandomGenerator::readDateTime(uint64_t block_offset,
+                                              const bufferlist& bufferlist)
+{
+  TimeBytes read_time = 0;
+  std::memcpy(&read_time,
+              &bufferlist[(block_offset * m_model.get_block_size()) + timeStart()],
+              timeLength());
+  return read_time;
+}
+
+bool HeaderedSeededRandomGenerator::validate(bufferlist& bufferlist,
+                                             uint64_t offset,
uint64_t length) +{ + std::vector<uint64_t> invalid_block_offsets; + + for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++) + { + bool valid_block + = validate_block(block_offset, + (bufferlist.c_str() + ((block_offset - offset) * + m_model.get_block_size()))); + if (!valid_block) + { + invalid_block_offsets.push_back(block_offset); + } + } + + if (!invalid_block_offsets.empty()) + { + printDebugInformationForOffsets(offset, invalid_block_offsets, bufferlist); + } + + return invalid_block_offsets.empty(); +} + +bool HeaderedSeededRandomGenerator::validate_block(uint64_t block_offset, + const char* buffer_start) +{ + // We validate the block matches what we generate byte for byte + // however we ignore the time section of the header + ceph::bufferptr bufferptr = generate_block(block_offset); + bool valid = strncmp(bufferptr.c_str(), buffer_start, timeStart()) == 0; + valid = valid ? strncmp(bufferptr.c_str() + timeEnd(), + buffer_start + timeEnd(), + m_model.get_block_size() - timeEnd()) == 0 : valid; + return valid; +} + +const HeaderedSeededRandomGenerator::ErrorType + HeaderedSeededRandomGenerator::getErrorTypeForBlock(uint64_t read_offset, + uint64_t block_offset, + const bufferlist& bufferlist) +{ + try + { + UniqueIdBytes read_unique_run_id = readUniqueRunId(block_offset - read_offset, + bufferlist); + if (unique_run_id != read_unique_run_id) + { + return ErrorType::RUN_ID_MISMATCH; + } + + SeedBytes read_seed = readSeed(block_offset - read_offset, bufferlist); + if (m_model.get_seed(block_offset) != read_seed) + { + return ErrorType::SEED_MISMATCH; + } + + if (std::strncmp(&bufferlist[((block_offset - read_offset) * + m_model.get_block_size()) + bodyStart()], + generate_block(block_offset).c_str() + bodyStart(), + m_model.get_block_size() - bodyStart()) != 0) + { + return ErrorType::DATA_MISMATCH; + } + } + catch(const std::exception& e) + { + return ErrorType::DATA_NOT_FOUND; + } + + return ErrorType::UNKNOWN; +} + +void HeaderedSeededRandomGenerator + ::printDebugInformationForBlock(uint64_t read_offset, uint64_t block_offset, + const bufferlist& bufferlist) +{ + ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset, bufferlist); + + TimeBytes read_time = 0; + std::time_t ttp; + + char read_bytes[m_model.get_block_size()]; + char generated_bytes[m_model.get_block_size()]; + + if (blockError == ErrorType::DATA_MISMATCH || blockError == ErrorType::UNKNOWN) + { + read_time = readDateTime(block_offset - read_offset, bufferlist); + std::chrono::system_clock::time_point time_point{std::chrono::milliseconds{read_time}}; + ttp = std::chrono::system_clock::to_time_t(time_point); + + std::memcpy(&read_bytes, + &bufferlist[((block_offset - read_offset) * m_model.get_block_size())], + m_model.get_block_size() - bodyStart()); + std::memcpy(&generated_bytes, + generate_block(block_offset).c_str(), + m_model.get_block_size() - bodyStart()); + } + + std::string error_string; + switch(blockError) + { + case ErrorType::RUN_ID_MISMATCH: + { + UniqueIdBytes read_unique_run_id = readUniqueRunId((block_offset - read_offset), + bufferlist); + error_string = fmt::format("Header (Run ID) mismatch detected at block {} " + "(byte offset {}) Header expected run id {} but found id {}. 
" + "Block data corrupt or not written from this instance of this application.", + block_offset, + block_offset * m_model.get_block_size(), + unique_run_id, + read_unique_run_id); + } + break; + + case ErrorType::SEED_MISMATCH: + { + SeedBytes read_seed = readSeed((block_offset - read_offset), bufferlist); + + if (m_model.get_seed_offsets(read_seed).size() == 0) + { + error_string = fmt::format("Data (Seed) mismatch detected at block {}" + " (byte offset {}). Header expected seed {} but found seed {}. " + "Read data was not from any other recognised block in the object.", + block_offset, + block_offset * m_model.get_block_size(), + m_model.get_seed(block_offset), + read_seed); + } + else + { + std::vector<int> seed_offsets = m_model.get_seed_offsets(read_seed); + error_string = fmt::format("Data (Seed) mismatch detected at block {}" + " (byte offset {}). Header expected seed {} but found seed {}." + " Read data was from a different block(s): {}", + block_offset, + block_offset * m_model.get_block_size(), + m_model.get_seed(block_offset), + read_seed, + fmt::join(seed_offsets.begin(), seed_offsets.end(), "")); + } + } + break; + + case ErrorType::DATA_MISMATCH: + { + error_string = fmt::format("Data (Body) mismatch detected at block {}" + " (byte offset {}). Header data matches, data body does not." + " Data written at {}\nExpected data: \n{:02x}\nRead data:{:02x}", + block_offset, + block_offset * m_model.get_block_size(), + std::ctime(&ttp), + fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""), + fmt::join(read_bytes, read_bytes + m_model.get_block_size(), "")); + } + break; + + case ErrorType::DATA_NOT_FOUND: + { + uint64_t bufferlist_length = bufferlist.to_str().size(); + error_string = fmt::format("Data (Body) could not be read at block {}" + " (byte offset {}) offset in bufferlist returned from read: {}" + " ({} bytes). 
Returned bufferlist length: {}.", + block_offset, + block_offset * m_model.get_block_size(), + (block_offset - read_offset), + (block_offset - read_offset) * m_model.get_block_size(), + bufferlist_length); + } + break; + + case ErrorType::UNKNOWN: + [[ fallthrough ]]; + + default: + { + error_string = fmt::format("Data mismatch detected at block {}" + " (byte offset {}).\nExpected data:\n{:02x}\nRead data:\n{:02x}", + block_offset, + block_offset * m_model.get_block_size(), + fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""), + fmt::join(read_bytes, read_bytes + m_model.get_block_size(), "")); + } + break; + } + dout(0) << error_string << dendl; +} + +void HeaderedSeededRandomGenerator + ::printDebugInformationForRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + ErrorType rangeError, + const bufferlist& bufferlist) +{ + switch(rangeError) + { + case ErrorType::RUN_ID_MISMATCH: + printDebugInformationForRunIdMismatchRange(read_offset, start_block_offset, + range_length_in_blocks, bufferlist); + break; + case ErrorType::SEED_MISMATCH: + printDebugInformationForSeedMismatchRange(read_offset, start_block_offset, + range_length_in_blocks, bufferlist); + break; + case ErrorType::DATA_MISMATCH: + printDebugInformationDataBodyMismatchRange(read_offset, start_block_offset, + range_length_in_blocks, bufferlist); + break; + case ErrorType::DATA_NOT_FOUND: + printDebugInformationDataNotFoundRange(read_offset, start_block_offset, + range_length_in_blocks, bufferlist); + break; + case ErrorType::UNKNOWN: + [[ fallthrough ]]; + default: + printDebugInformationCorruptRange(read_offset, start_block_offset, + range_length_in_blocks, bufferlist); + break; + } +} + +void HeaderedSeededRandomGenerator + ::printDebugInformationForRunIdMismatchRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + const bufferlist& bufferlist) +{ + uint64_t range_start = start_block_offset; + uint64_t range_length = 0; + UniqueIdBytes initial_read_unique_run_id = readUniqueRunId(start_block_offset - read_offset, + bufferlist); + for (uint64_t i = start_block_offset; + i < start_block_offset + range_length_in_blocks; i++) + { + ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) + == ErrorType::RUN_ID_MISMATCH); + + UniqueIdBytes read_unique_run_id = readUniqueRunId(i - read_offset, bufferlist); + if (initial_read_unique_run_id != read_unique_run_id || + i == (start_block_offset + range_length_in_blocks - 1)) + { + if (range_length == 1) + { + printDebugInformationForBlock(read_offset, i, bufferlist); + } + else if (range_length > 1) + { + dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {} ({} bytes)" + " and spanning a range of {} blocks ({} bytes). " + "Expected run id {} for range but found id {}" + " for all blocks in range. 
" + "Block data corrupt or not written from this instance of this application.", + range_start, + range_start * m_model.get_block_size(), + range_length, + range_length * m_model.get_block_size(), + unique_run_id, + initial_read_unique_run_id) << dendl; + } + + range_start = i; + range_length = 1; + initial_read_unique_run_id = read_unique_run_id; + } + else + { + range_length++; + } + } + + if (range_length == 1) + { + printDebugInformationForBlock(read_offset, + start_block_offset + range_length_in_blocks - 1, + bufferlist); + } + else if (range_length > 1) + { + dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {}" + " ({} bytes) and spanning a range of {} blocks ({} bytes). " + "Expected run id {} for range but found id for all blocks in range. " + "Block data corrupt or not written from this instance of this application.", + range_start, + range_start * m_model.get_block_size(), + range_length, + range_length * m_model.get_block_size(), + unique_run_id, + initial_read_unique_run_id) + << dendl; + } +} + +void HeaderedSeededRandomGenerator + ::printDebugInformationForSeedMismatchRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + const bufferlist& bufferlist) +{ + uint64_t range_start = start_block_offset; + uint64_t range_length = 0; + + // Assert here if needed, as we can't support values + // that can't be converted to a signed integer. + ceph_assert(m_model.get_block_size() < (std::numeric_limits<uint64_t>::max() / 2)); + std::optional<int64_t> range_offset = 0; + + for (uint64_t i = start_block_offset; + i < start_block_offset + range_length_in_blocks; i++) + { + ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) + == ErrorType::SEED_MISMATCH); + SeedBytes read_seed = readSeed(i - read_offset, bufferlist); + + std::vector<int> seed_found_offsets = m_model.get_seed_offsets(read_seed); + + if ((seed_found_offsets.size() == 1 && + (static_cast<int64_t>(seed_found_offsets.front() - i) == range_offset)) || + range_length == 0) + { + if (range_length == 0) + { + range_start = i; + if (seed_found_offsets.size() > 0) + { + range_offset = seed_found_offsets.front() - i; + } + else + { + range_offset = std::nullopt; + } + } + range_length++; + } + else + { + if (range_length == 1) + { + printDebugInformationForBlock(read_offset, i - 1, bufferlist); + } + else if (range_length > 1 && range_offset.has_value()) + { + dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}" + " ({} bytes) and spanning a range of {} blocks ({} bytes). " + "Returned data located starting from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes).", + range_start, + range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + static_cast<uint64_t>(*range_offset) + range_start, + (static_cast<uint64_t>(*range_offset) + range_start) + * m_model.get_block_size(), + range_length, + range_length * m_model.get_block_size()) + << dendl; + } + else + { + dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}" + " ({} bytes) and spanning a range of {} blocks ({} bytes). 
" + "Data seed mismatch spanning a range of {} blocks ({} bytes).", + range_start, + range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + range_length, + range_length * m_model.get_block_size()) + << dendl; + } + range_length = 1; + range_start = i; + if (seed_found_offsets.size() > 0) + { + range_offset = seed_found_offsets.front() - i; + } + else + { + range_offset = std::nullopt; + } + } + } + + if (range_length == 1) + { + printDebugInformationForBlock(read_offset, + start_block_offset + range_length_in_blocks - 1, + bufferlist); + } + else if (range_length > 1 && range_offset.has_value()) + { + dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes). " + "Returned data located starting from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes).", + range_start, + range_start * m_model.get_block_size(), + range_length, + range_length * m_model.get_block_size(), + *range_offset + range_start, + (*range_offset + range_start) * m_model.get_block_size(), + range_length, + range_length * m_model.get_block_size()) + << dendl; + } + else + { + dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes). " + "and spanning a range of {} blocks ({} bytes).", + range_start, + range_start * m_model.get_block_size(), + range_length, + range_length * m_model.get_block_size(), + range_length, + range_length * m_model.get_block_size()) + << dendl; + } +} + +void HeaderedSeededRandomGenerator +::printDebugInformationDataBodyMismatchRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + const bufferlist& bufferlist) +{ + dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. " + "Headers look as expected for range, " + "but generated data body does not match. " + "More information given for individual blocks below.", + start_block_offset, + start_block_offset + range_length_in_blocks - 1) + << dendl; + + for (uint64_t i = start_block_offset; + i < start_block_offset + range_length_in_blocks; i++) + { + printDebugInformationForBlock(read_offset, i, bufferlist); + } +} + +void HeaderedSeededRandomGenerator + ::printDebugInformationCorruptRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + const bufferlist& bufferlist) +{ + dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. " + "Headers look as expected for range, " + "but generated data body does not match. " + "More information given for individual blocks below.", + start_block_offset, + start_block_offset + range_length_in_blocks - 1) + << dendl; + + for (uint64_t i = start_block_offset; + i < start_block_offset + range_length_in_blocks; i++) + { + printDebugInformationForBlock(read_offset, i, bufferlist); + } +} + +void HeaderedSeededRandomGenerator + ::printDebugInformationDataNotFoundRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + const bufferlist& bufferlist) +{ + dout(0) << fmt::format("Data not found for blocks from {} to {}. 
" + "More information given for individual blocks below.", + start_block_offset, + start_block_offset + range_length_in_blocks - 1) + << dendl; + + for (uint64_t i = start_block_offset; i < start_block_offset + range_length_in_blocks; i++) + { + printDebugInformationForBlock(read_offset, i, bufferlist); + } +} + +void HeaderedSeededRandomGenerator + ::printDebugInformationForOffsets(uint64_t read_offset, + std::vector<uint64_t> offsets, + const bufferlist& bufferlist) +{ + uint64_t range_start = 0; + uint64_t range_length = 0; + ErrorType rangeError = ErrorType::UNKNOWN; + + for (const uint64_t& block_offset : offsets) + { + ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset, + bufferlist); + + if (range_start == 0 && range_length == 0) + { + range_start = block_offset; + range_length = 1; + rangeError = blockError; + } + else if (blockError == rangeError && + range_start + range_length == block_offset) +{ + range_length++; + } + else + { + if (range_length == 1) + { + printDebugInformationForBlock(read_offset, range_start, bufferlist); + } + else if (range_length > 1) + { + printDebugInformationForRange(read_offset, range_start, range_length, + rangeError, bufferlist); + } + + range_start = block_offset; + range_length = 1; + rangeError = blockError; + } + } + + if (range_length == 1) + { + printDebugInformationForBlock(read_offset, range_start, bufferlist); + } + else if (range_length > 1) + { + printDebugInformationForRange(read_offset, range_start, range_length, + rangeError, bufferlist); + } +}
\ No newline at end of file diff --git a/src/common/io_exerciser/DataGenerator.h b/src/common/io_exerciser/DataGenerator.h new file mode 100644 index 00000000000..1e5784a54cc --- /dev/null +++ b/src/common/io_exerciser/DataGenerator.h @@ -0,0 +1,171 @@ +#pragma once + +#include <memory> +#include <random> + +#include "include/buffer.h" +#include "ObjectModel.h" + +/* Overview + * + * class DataGenerator + * Generates data buffers for write I/Os using state queried + * from ObjectModel. Validates data buffers for read I/Os + * against the state in the ObjectModel. If a data miscompare + * is detected provide debug information about the state of the + * object, the buffer that was read and the expected buffer. + * + * + * class SeededRandomGenerator + * Inherits from DataGenerator. Generates entirely random patterns + * based on the seed retrieved by the model. + * + * + * class HeaderedSeededRandomGenerator + * Inherits from SeededDataGenerator. Generates entirely random patterns + * based on the seed retrieved by the model, however also appends a + * header to the start of each block. This generator also provides + * a range of verbose debug options to help disagnose a miscompare + * whenever it detects unexpected data. + */ + +namespace ceph { + namespace io_exerciser { + namespace data_generation { + enum class GenerationType { + SeededRandom, + HeaderedSeededRandom + // CompressedGenerator + // MixedGenerator + }; + + class DataGenerator { + public: + virtual ~DataGenerator() = default; + static std::unique_ptr<DataGenerator> + create_generator(GenerationType generatorType, + const ObjectModel& model); + virtual bufferlist generate_data(uint64_t length, uint64_t offset)=0; + virtual bool validate(bufferlist& bufferlist, uint64_t offset, + uint64_t length); + + // Used for testing debug outputs from data generation + virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length); + + protected: + const ObjectModel& m_model; + + DataGenerator(const ObjectModel& model) : m_model(model) {} + }; + + class SeededRandomGenerator : public DataGenerator + { + public: + SeededRandomGenerator(const ObjectModel& model) + : DataGenerator(model) {} + + virtual bufferptr generate_block(uint64_t offset); + virtual bufferlist generate_data(uint64_t length, uint64_t offset); + virtual bufferptr generate_wrong_block(uint64_t offset); + virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length) override; + }; + + class HeaderedSeededRandomGenerator : public SeededRandomGenerator + { + public: + HeaderedSeededRandomGenerator(const ObjectModel& model, + std::optional<uint64_t> unique_run_id = std::nullopt); + + bufferptr generate_block(uint64_t offset) override; + bufferptr generate_wrong_block(uint64_t offset) override; + bool validate(bufferlist& bufferlist, uint64_t offset, + uint64_t length) override; + + private: + using UniqueIdBytes = uint64_t; + using SeedBytes = int; + using TimeBytes = uint64_t; + + enum class ErrorType { + RUN_ID_MISMATCH, + SEED_MISMATCH, + DATA_MISMATCH, + DATA_NOT_FOUND, + UNKNOWN + }; + + constexpr uint8_t headerStart() const + { return 0; }; + constexpr uint8_t uniqueIdStart() const + { return headerStart(); }; + constexpr uint8_t uniqueIdLength() const + { return sizeof(UniqueIdBytes); }; + constexpr uint8_t seedStart() const + { return uniqueIdStart() + uniqueIdLength(); }; + constexpr uint8_t seedLength() const + { return sizeof(SeedBytes); }; + constexpr uint8_t timeStart() const + { return seedStart() + seedLength(); }; + constexpr uint8_t 
timeLength() const
+        { return sizeof(TimeBytes); };
+      constexpr uint8_t timeEnd() const
+        { return timeStart() + timeLength(); };
+      constexpr uint8_t headerLength() const
+        { return uniqueIdLength() + seedLength() + timeLength(); };
+      constexpr uint8_t bodyStart() const
+        { return headerStart() + headerLength(); };
+
+      const UniqueIdBytes readUniqueRunId(uint64_t block_offset,
+                                          const bufferlist& bufferlist);
+      const SeedBytes readSeed(uint64_t block_offset,
+                               const bufferlist& bufferlist);
+      const TimeBytes readDateTime(uint64_t block_offset,
+                                   const bufferlist& bufferlist);
+
+      const UniqueIdBytes unique_run_id;
+
+      uint64_t generate_unique_run_id();
+
+      bool validate_block(uint64_t block_offset, const char* buffer_start);
+
+      const ErrorType getErrorTypeForBlock(uint64_t read_offset,
+                                           uint64_t block_offset,
+                                           const bufferlist& bufferlist);
+
+      void printDebugInformationForBlock(uint64_t read_offset,
+                                         uint64_t block_offset,
+                                         const bufferlist& bufferlist);
+      void printDebugInformationForRange(uint64_t read_offset,
+                                         uint64_t start_block_offset,
+                                         uint64_t range_length_in_blocks,
+                                         ErrorType rangeError,
+                                         const bufferlist& bufferlist);
+
+      void printDebugInformationForRunIdMismatchRange(uint64_t read_offset,
+                                                      uint64_t start_block_offset,
+                                                      uint64_t range_length_in_blocks,
+                                                      const bufferlist& bufferlist);
+      void printDebugInformationForSeedMismatchRange(uint64_t read_offset,
+                                                     uint64_t start_block_offset,
+                                                     uint64_t range_length_in_blocks,
+                                                     const bufferlist& bufferlist);
+      void printDebugInformationDataBodyMismatchRange(uint64_t read_offset,
+                                                      uint64_t start_block_offset,
+                                                      uint64_t range_length_in_blocks,
+                                                      const bufferlist& bufferlist);
+      void printDebugInformationDataNotFoundRange(uint64_t read_offset,
+                                                  uint64_t start_block_offset,
+                                                  uint64_t range_length_in_blocks,
+                                                  const bufferlist& bufferlist);
+      void printDebugInformationCorruptRange(uint64_t read_offset,
+                                             uint64_t start_block_offset,
+                                             uint64_t range_length_in_blocks,
+                                             const bufferlist& bufferlist);
+
+      void printDebugInformationForOffsets(uint64_t read_offset,
+                                           std::vector<uint64_t> offsets,
+                                           const bufferlist& bufferlist);
+    };
+    }
+  }
+}
diff --git a/src/common/io_exerciser/IoOp.cc b/src/common/io_exerciser/IoOp.cc
new file mode 100644
index 00000000000..cd855ba6fff
--- /dev/null
+++ b/src/common/io_exerciser/IoOp.cc
@@ -0,0 +1,188 @@
+#include "IoOp.h"
+
+using IoOp = ceph::io_exerciser::IoOp;
+
+IoOp::IoOp( OpType op,
+            uint64_t offset1, uint64_t length1,
+            uint64_t offset2, uint64_t length2,
+            uint64_t offset3, uint64_t length3) :
+  op(op),
+  offset1(offset1), length1(length1),
+  offset2(offset2), length2(length2),
+  offset3(offset3), length3(length3)
+{
+
+}
+
+std::string IoOp::value_to_string(uint64_t v) const
+{
+  if (v < 1024 || (v % 1024) != 0) {
+    return std::to_string(v);
+  }else if (v < 1024*1024 || (v % (1024 * 1024)) != 0 ) {
+    return std::to_string(v / 1024) + "K";
+  }else{
+    return std::to_string(v / 1024 / 1024) + "M";
+  }
+}
+
+std::unique_ptr<IoOp> IoOp
+  ::generate_done() {
+
+  return std::make_unique<IoOp>(OpType::Done);
+}
+
+std::unique_ptr<IoOp> IoOp
+  ::generate_barrier() {
+
+  return std::make_unique<IoOp>(OpType::BARRIER);
+}
+
+std::unique_ptr<IoOp> IoOp
+  ::generate_create(uint64_t size) {
+
+  return std::make_unique<IoOp>(OpType::CREATE,0,size);
+}
+
+std::unique_ptr<IoOp> IoOp
+  ::generate_remove() {
+
+  return std::make_unique<IoOp>(OpType::REMOVE);
+}
+
+std::unique_ptr<IoOp> IoOp
+  ::generate_read(uint64_t offset, uint64_t length) {
+
+  return std::make_unique<IoOp>(OpType::READ,
offset, length); +} + +std::unique_ptr<IoOp> IoOp + ::generate_read2(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2) { + + if (offset1 < offset2) { + ceph_assert( offset1 + length1 <= offset2 ); + } else { + ceph_assert( offset2 + length2 <= offset1 ); + } + + return std::make_unique<IoOp>(OpType::READ2, + offset1, length1, + offset2, length2); +} + +std::unique_ptr<IoOp> IoOp + ::generate_read3(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) { + + if (offset1 < offset2) { + ceph_assert( offset1 + length1 <= offset2 ); + } else { + ceph_assert( offset2 + length2 <= offset1 ); + } + if (offset1 < offset3) { + ceph_assert( offset1 + length1 <= offset3 ); + } else { + ceph_assert( offset3 + length3 <= offset1 ); + } + if (offset2 < offset3) { + ceph_assert( offset2 + length2 <= offset3 ); + } else { + ceph_assert( offset3 + length3 <= offset2 ); + } + return std::make_unique<IoOp>(OpType::READ3, + offset1, length1, + offset2, length2, + offset3, length3); +} + +std::unique_ptr<IoOp> IoOp::generate_write(uint64_t offset, uint64_t length) { + return std::make_unique<IoOp>(OpType::WRITE, offset, length); +} + +std::unique_ptr<IoOp> IoOp::generate_write2(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2) { + if (offset1 < offset2) { + ceph_assert( offset1 + length1 <= offset2 ); + } else { + ceph_assert( offset2 + length2 <= offset1 ); + } + return std::make_unique<IoOp>(OpType::WRITE2, + offset1, length1, + offset2, length2); +} + +std::unique_ptr<IoOp> IoOp::generate_write3(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) { + if (offset1 < offset2) { + ceph_assert( offset1 + length1 <= offset2 ); + } else { + ceph_assert( offset2 + length2 <= offset1 ); + } + if (offset1 < offset3) { + ceph_assert( offset1 + length1 <= offset3 ); + } else { + ceph_assert( offset3 + length3 <= offset1 ); + } + if (offset2 < offset3) { + ceph_assert( offset2 + length2 <= offset3 ); + } else { + ceph_assert( offset3 + length3 <= offset2 ); + } + return std::make_unique<IoOp>(OpType::WRITE3, + offset1, length1, + offset2, length2, + offset3, length3); +} + +bool IoOp::done() { + return (op == OpType::Done); +} + +std::string IoOp::to_string(uint64_t block_size) const +{ + switch (op) { + case OpType::Done: + return "Done"; + case OpType::BARRIER: + return "Barrier"; + case OpType::CREATE: + return "Create (size=" + value_to_string(length1 * block_size) + ")"; + case OpType::REMOVE: + return "Remove"; + case OpType::READ: + return "Read (offset=" + value_to_string(offset1 * block_size) + + ",length=" + value_to_string(length1 * block_size) + ")"; + case OpType::READ2: + return "Read2 (offset1=" + value_to_string(offset1 * block_size) + + ",length1=" + value_to_string(length1 * block_size) + + ",offset2=" + value_to_string(offset2 * block_size) + + ",length2=" + value_to_string(length2 * block_size) + ")"; + case OpType::READ3: + return "Read3 (offset1=" + value_to_string(offset1 * block_size) + + ",length1=" + value_to_string(length1 * block_size) + + ",offset2=" + value_to_string(offset2 * block_size) + + ",length2=" + value_to_string(length2 * block_size) + + ",offset3=" + value_to_string(offset3 * block_size) + + ",length3=" + value_to_string(length3 * block_size) + ")"; + case OpType::WRITE: + return "Write (offset=" + value_to_string(offset1 * block_size) + + ",length=" + value_to_string(length1 * block_size) + ")"; + case OpType::WRITE2: 
+ return "Write2 (offset1=" + value_to_string(offset1 * block_size) + + ",length1=" + value_to_string(length1 * block_size) + + ",offset2=" + value_to_string(offset2 * block_size) + + ",length2=" + value_to_string(length2 * block_size) + ")"; + case OpType::WRITE3: + return "Write3 (offset1=" + value_to_string(offset1 * block_size) + + ",length1=" + value_to_string(length1 * block_size) + + ",offset2=" + value_to_string(offset2 * block_size) + + ",length2=" + value_to_string(length2 * block_size) + + ",offset3=" + value_to_string(offset3 * block_size) + + ",length3=" + value_to_string(length3 * block_size) + ")"; + default: + break; + } + return "Unknown"; +}
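value_to_string() above appends a K or M suffix only when the value is an exact multiple of that unit; otherwise it prints the raw byte count. The expected renderings in the comments below are derived by hand from those conditionals, and the helper simply restates the same rules outside the class for a quick sanity check; it is not part of the patch.

// Expected rendering for a few byte counts (derived from the conditionals above,
// not captured from a build):
//   512      -> "512"    (below 1K)
//   1536     -> "1536"   (not a multiple of 1024, printed raw)
//   2048     -> "2K"
//   1047552  -> "1023K"  (multiple of 1024 but below 1M)
//   1048576  -> "1M"
//   3145728  -> "3M"
#include <cassert>
#include <cstdint>
#include <string>

static std::string render(uint64_t v) {
  if (v < 1024 || (v % 1024) != 0) return std::to_string(v);
  if (v < 1024 * 1024 || (v % (1024 * 1024)) != 0) return std::to_string(v / 1024) + "K";
  return std::to_string(v / 1024 / 1024) + "M";
}

int main() {
  assert(render(1536) == "1536");
  assert(render(2048) == "2K");
  assert(render(1048576) == "1M");
}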
\ No newline at end of file diff --git a/src/common/io_exerciser/IoOp.h b/src/common/io_exerciser/IoOp.h new file mode 100644 index 00000000000..60c02a93d4e --- /dev/null +++ b/src/common/io_exerciser/IoOp.h @@ -0,0 +1,94 @@ +#pragma once + +#include <string> +#include <memory> +#include "include/ceph_assert.h" + +/* Overview + * + * enum OpType + * Enumeration of different types of I/O operation + * + * class IoOp + * Stores details for an I/O operation. Generated by IoSequences + * and applied by IoExerciser's + */ + +namespace ceph { + namespace io_exerciser { + + enum class OpType { + Done, // End of I/O sequence + BARRIER, // Barrier - all prior I/Os must complete + CREATE, // Create object and pattern with data + REMOVE, // Remove object + READ, // Read + READ2, // 2 Reads in one op + READ3, // 3 Reads in one op + WRITE, // Write + WRITE2, // 2 Writes in one op + WRITE3 // 3 Writes in one op + }; + + class IoOp { + protected: + std::string value_to_string(uint64_t v) const; + + public: + OpType op; + uint64_t offset1; + uint64_t length1; + uint64_t offset2; + uint64_t length2; + uint64_t offset3; + uint64_t length3; + + IoOp( OpType op, + uint64_t offset1 = 0, uint64_t length1 = 0, + uint64_t offset2 = 0, uint64_t length2 = 0, + uint64_t offset3 = 0, uint64_t length3 = 0 ); + + static std::unique_ptr<IoOp> generate_done(); + + static std::unique_ptr<IoOp> generate_barrier(); + + static std::unique_ptr<IoOp> generate_create(uint64_t size); + + static std::unique_ptr<IoOp> generate_remove(); + + static std::unique_ptr<IoOp> generate_read(uint64_t offset, + uint64_t length); + + static std::unique_ptr<IoOp> generate_read2(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2); + + static std::unique_ptr<IoOp> generate_read3(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2, + uint64_t offset3, + uint64_t length3); + + static std::unique_ptr<IoOp> generate_write(uint64_t offset, + uint64_t length); + + static std::unique_ptr<IoOp> generate_write2(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2); + + static std::unique_ptr<IoOp> generate_write3(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2, + uint64_t offset3, + uint64_t length3); + + bool done(); + + std::string to_string(uint64_t block_size) const; + }; + } +}
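A sketch of how the IoOp factory functions and to_string() are meant to be used together. The include path and the 4096-byte block size are assumptions for illustration; offsets and lengths are given in blocks, the multi-region variants assert that their regions do not overlap, and the expected strings in the comments follow from value_to_string() as defined in IoOp.cc above.

#include <cstdint>
#include <iostream>
#include "common/io_exerciser/IoOp.h"   // path as added in this series

using ceph::io_exerciser::IoOp;

int main() {
  const uint64_t block_size = 4096;   // example only; supplied by the caller

  auto create = IoOp::generate_create(8);          // 8-block object
  auto write  = IoOp::generate_write2(0, 2, 4, 2); // two non-overlapping regions
  auto read   = IoOp::generate_read(0, 8);
  auto done   = IoOp::generate_done();

  std::cout << create->to_string(block_size) << "\n"  // Create (size=32K)
            << write->to_string(block_size) << "\n"   // Write2 (offset1=0,length1=8K,offset2=16K,length2=8K)
            << read->to_string(block_size) << "\n";   // Read (offset=0,length=32K)

  // done->done() is true, signalling the end of a sequence.
  return done->done() ? 0 : 1;
}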
\ No newline at end of file diff --git a/src/common/io_exerciser/IoSequence.cc b/src/common/io_exerciser/IoSequence.cc new file mode 100644 index 00000000000..4a7ca0593d1 --- /dev/null +++ b/src/common/io_exerciser/IoSequence.cc @@ -0,0 +1,500 @@ +#include "IoSequence.h" + +using Sequence = ceph::io_exerciser::Sequence; +using IoSequence = ceph::io_exerciser::IoSequence; + +std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& seq) +{ + switch (seq) + { + case Sequence::SEQUENCE_SEQ0: + os << "SEQUENCE_SEQ0"; + break; + case Sequence::SEQUENCE_SEQ1: + os << "SEQUENCE_SEQ1"; + break; + case Sequence::SEQUENCE_SEQ2: + os << "SEQUENCE_SEQ2"; + break; + case Sequence::SEQUENCE_SEQ3: + os << "SEQUENCE_SEQ3"; + break; + case Sequence::SEQUENCE_SEQ4: + os << "SEQUENCE_SEQ4"; + break; + case Sequence::SEQUENCE_SEQ5: + os << "SEQUENCE_SEQ5"; + break; + case Sequence::SEQUENCE_SEQ6: + os << "SEQUENCE_SEQ6"; + break; + case Sequence::SEQUENCE_SEQ7: + os << "SEQUENCE_SEQ7"; + break; + case Sequence::SEQUENCE_SEQ8: + os << "SEQUENCE_SEQ8"; + break; + case Sequence::SEQUENCE_SEQ9: + os << "SEQUENCE_SEQ9"; + break; + case Sequence::SEQUENCE_END: + os << "SEQUENCE_END"; + break; + } + return os; +} + +IoSequence::IoSequence(std::pair<int,int> obj_size_range, + int seed) : + min_obj_size(obj_size_range.first), max_obj_size(obj_size_range.second), + create(true), barrier(false), done(false), remove(false), + obj_size(min_obj_size), step(-1), seed(seed) +{ + rng.seed(seed); +} + +std::unique_ptr<IoSequence> IoSequence::generate_sequence(Sequence s, + std::pair<int,int> obj_size_range, + int seed) +{ + switch (s) { + case Sequence::SEQUENCE_SEQ0: + return std::make_unique<Seq0>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ1: + return std::make_unique<Seq1>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ2: + return std::make_unique<Seq2>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ3: + return std::make_unique<Seq3>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ4: + return std::make_unique<Seq4>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ5: + return std::make_unique<Seq5>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ6: + return std::make_unique<Seq6>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ7: + return std::make_unique<Seq7>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ8: + return std::make_unique<Seq8>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ9: + return std::make_unique<Seq9>(obj_size_range, seed); + default: + break; + } + return nullptr; +} + +int IoSequence::get_step() const +{ + return step; +} + +int IoSequence::get_seed() const +{ + return seed; +} + +void IoSequence::set_min_object_size(uint64_t size) +{ + min_obj_size = size; + if (obj_size < size) { + obj_size = size; + if (obj_size > max_obj_size) { + done = true; + } + } +} + +void IoSequence::set_max_object_size(uint64_t size) +{ + max_obj_size = size; + if (obj_size > size) { + done = true; + } +} + +void IoSequence::select_random_object_size() +{ + if (max_obj_size != min_obj_size) { + obj_size = min_obj_size + rng(max_obj_size - min_obj_size); + } +} + +std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::increment_object_size() +{ + obj_size++; + if (obj_size > max_obj_size) { + done = true; + } + create = true; + barrier = true; + remove = true; + return IoOp::generate_barrier(); +} + +std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::next() +{ + step++; + if (remove) { + remove = false; + return IoOp::generate_remove(); + } + if (barrier) { + 
barrier = false; + return IoOp::generate_barrier(); + } + if (done) { + return IoOp::generate_done(); + } + if (create) { + create = false; + barrier = true; + return IoOp::generate_create(obj_size); + } + return _next(); +} + + + +ceph::io_exerciser::Seq0::Seq0(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed), offset(0) +{ + select_random_object_size(); + length = 1 + rng(obj_size - 1); +} + +std::string ceph::io_exerciser::Seq0::get_name() const +{ + return "Sequential reads of length " + std::to_string(length) + + " with queue depth 1 (seqseed " + std::to_string(get_seed()) + ")"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq0::_next() +{ + std::unique_ptr<IoOp> r; + if (offset >= obj_size) { + done = true; + barrier = true; + remove = true; + return IoOp::generate_barrier(); + } + if (offset + length > obj_size) { + r = IoOp::generate_read(offset, obj_size - offset); + } else { + r = IoOp::generate_read(offset, length); + } + offset += length; + return r; +} + + + +ceph::io_exerciser::Seq1::Seq1(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed) +{ + select_random_object_size(); + count = 3 * obj_size; +} + +std::string ceph::io_exerciser::Seq1::get_name() const +{ + return "Random offset, random length read/write I/O with queue depth 1 (seqseed " + + std::to_string(get_seed()) + ")"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq1::_next() +{ + barrier = true; + if (count-- == 0) { + done = true; + remove = true; + return IoOp::generate_barrier(); + } + + uint64_t offset = rng(obj_size - 1); + uint64_t length = 1 + rng(obj_size - 1 - offset); + return (rng(2) != 0) ? IoOp::generate_write(offset, length) : + IoOp::generate_read(offset, length); +} + + + +ceph::io_exerciser::Seq2::Seq2(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed), offset(0), length(0) {} + +std::string ceph::io_exerciser::Seq2::get_name() const +{ + return "Permutations of offset and length read I/O"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next() +{ + length++; + if (length > obj_size - offset) { + length = 1; + offset++; + if (offset >= obj_size) { + offset = 0; + length = 0; + return increment_object_size(); + } + } + return IoOp::generate_read(offset, length); +} + + + +ceph::io_exerciser::Seq3::Seq3(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed), offset1(0), offset2(0) +{ + set_min_object_size(2); +} + +std::string ceph::io_exerciser::Seq3::get_name() const +{ + return "Permutations of offset 2-region 1-block read I/O"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next() +{ + offset2++; + if (offset2 >= obj_size - offset1) { + offset2 = 1; + offset1++; + if (offset1 + 1 >= obj_size) { + offset1 = 0; + offset2 = 0; + return increment_object_size(); + } + } + return IoOp::generate_read2(offset1, 1, offset1 + offset2, 1); +} + + + +ceph::io_exerciser::Seq4::Seq4(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed), offset1(0), offset2(1) +{ + set_min_object_size(3); +} + +std::string ceph::io_exerciser::Seq4::get_name() const +{ + return "Permutations of offset 3-region 1-block read I/O"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next() +{ + offset2++; + if (offset2 >= obj_size - offset1) { + offset2 = 2; + offset1++; + if (offset1 + 2 >= obj_size) { + offset1 = 0; + offset2 = 1; + return 
increment_object_size(); + } + } + return IoOp::generate_read3(offset1, 1, + offset1 + offset2, 1, + (offset1 * 2 + offset2)/2, 1); +} + + + +ceph::io_exerciser::Seq5::Seq5(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed), offset(0), length(1), + doneread(false), donebarrier(false) {} + +std::string ceph::io_exerciser::Seq5::get_name() const +{ + return "Permutation of length sequential writes"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next() +{ + if (offset >= obj_size) { + if (!doneread) { + if (!donebarrier) { + donebarrier = true; + return IoOp::generate_barrier(); + } + doneread = true; + barrier = true; + return IoOp::generate_read(0, obj_size); + } + doneread = false; + donebarrier = false; + offset = 0; + length++; + if (length > obj_size) { + length = 1; + return increment_object_size(); + } + } + uint64_t io_len = (offset + length > obj_size) ? (obj_size - offset) : length; + std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len); + offset += io_len; + return r; +} + + + +ceph::io_exerciser::Seq6::Seq6(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed), offset(0), length(1), + doneread(false), donebarrier(false) {} + +std::string ceph::io_exerciser::Seq6::get_name() const +{ + return "Permutation of length sequential writes, different alignment"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next() +{ + if (offset >= obj_size) { + if (!doneread) { + if (!donebarrier) { + donebarrier = true; + return IoOp::generate_barrier(); + } + doneread = true; + barrier = true; + return IoOp::generate_read(0, obj_size); + } + doneread = false; + donebarrier = false; + offset = 0; + length++; + if (length > obj_size) { + length = 1; + return increment_object_size(); + } + } + uint64_t io_len = (offset == 0) ? 
(obj_size % length) : length; + if (io_len == 0) { + io_len = length; + } + std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len); + offset += io_len; + return r; +} + + + +ceph::io_exerciser::Seq7::Seq7(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed) +{ + set_min_object_size(2); + offset = obj_size; +} + +std::string ceph::io_exerciser::Seq7::get_name() const +{ + return "Permutations of offset 2-region 1-block writes"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq7::_next() +{ + if (!doneread) { + if (!donebarrier) { + donebarrier = true; + return IoOp::generate_barrier(); + } + doneread = true; + barrier = true; + return IoOp::generate_read(0, obj_size); + } + if (offset == 0) { + doneread = false; + donebarrier = false; + offset = obj_size+1; + return increment_object_size(); + } + offset--; + if (offset == obj_size/2) { + return _next(); + } + doneread = false; + donebarrier = false; + return IoOp::generate_write2(offset, 1, obj_size/2, 1); +} + + + +ceph::io_exerciser::Seq8::Seq8(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed), offset1(0), offset2(1) +{ + set_min_object_size(3); +} + +std::string ceph::io_exerciser::Seq8::get_name() const +{ + return "Permutations of offset 3-region 1-block write I/O"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next() +{ + if (!doneread) { + if (!donebarrier) { + donebarrier = true; + return IoOp::generate_barrier(); + } + doneread = true; + barrier = true; + return IoOp::generate_read(0, obj_size); + } + offset2++; + if (offset2 >= obj_size - offset1) { + offset2 = 2; + offset1++; + if (offset1 + 2 >= obj_size) { + offset1 = 0; + offset2 = 1; + return increment_object_size(); + } + } + doneread = false; + donebarrier = false; + return IoOp::generate_write3(offset1, 1, + offset1 + offset2, 1, + (offset1 * 2 + offset2)/2, 1); +} + + + +ceph::io_exerciser::Seq9::Seq9(std::pair<int,int> obj_size_range, int seed) : + IoSequence(obj_size_range, seed), offset(0), length(0) +{ + +} + +std::string ceph::io_exerciser::Seq9::get_name() const +{ + return "Permutations of offset and length write I/O"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next() +{ + if (!doneread) { + if (!donebarrier) { + donebarrier = true; + return IoOp::generate_barrier(); + } + doneread = true; + barrier = true; + return IoOp::generate_read(0, obj_size); + } + length++; + if (length > obj_size - offset) { + length = 1; + offset++; + if (offset >= obj_size) { + offset = 0; + length = 0; + return increment_object_size(); + } + } + doneread = false; + donebarrier = false; + return IoOp::generate_write(offset, length); +}
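A sketch of the consumer loop implied by IoSequence::next() and IoOp::done(). In the real series the generated ops are applied through a Model (RadosIo), which is not reproduced here; the object-size range, seed and block size are example values.

#include <iostream>
#include <memory>
#include <utility>
#include "common/io_exerciser/IoSequence.h"

using namespace ceph::io_exerciser;

int main() {
  // Object sizes between 1 and 32 blocks, fixed seed for reproducibility.
  auto seq = IoSequence::generate_sequence(Sequence::SEQUENCE_SEQ0,
                                           std::make_pair(1, 32), 12345);
  std::cout << seq->get_name() << "\n";

  // next() interleaves the CREATE/BARRIER/REMOVE bookkeeping with the
  // sequence-specific reads and writes until a Done op is returned.
  while (true) {
    std::unique_ptr<IoOp> op = seq->next();
    if (op->done()) break;
    std::cout << seq->get_step() << ": " << op->to_string(4096) << "\n";
  }
}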
\ No newline at end of file diff --git a/src/common/io_exerciser/IoSequence.h b/src/common/io_exerciser/IoSequence.h new file mode 100644 index 00000000000..114ff76303f --- /dev/null +++ b/src/common/io_exerciser/IoSequence.h @@ -0,0 +1,223 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +#include "IoOp.h" + +#include "include/random.h" + +/* Overview + * + * enum Sequence + * Enumeration of the different sequences + * + * class IoSequence + * Virtual class. IoSequences generate a stream of IoOPs. + * Sequences typically exhastively test permutations of + * offset and length to allow validation of code such as + * Erasure Coding. An IoSequence does not determine + * whether I/Os are issued sequentially or in parallel, + * it must generate barrier I/Os where operations must + * be serialized. + * + * class Seq* + * Implementations of IoSequence. Each class generates + * a different sequence of I/O. + * + * generate_sequence + * Create an IoSequence + */ + +namespace ceph { + namespace io_exerciser { + + enum class Sequence { + SEQUENCE_SEQ0, + SEQUENCE_SEQ1, + SEQUENCE_SEQ2, + SEQUENCE_SEQ3, + SEQUENCE_SEQ4, + SEQUENCE_SEQ5, + SEQUENCE_SEQ6, + SEQUENCE_SEQ7, + SEQUENCE_SEQ8, + SEQUENCE_SEQ9, + // + SEQUENCE_END, + SEQUENCE_BEGIN = SEQUENCE_SEQ0 + }; + + inline Sequence operator++( Sequence& s ) + { + return s = (Sequence)(((int)(s) + 1)); + } + + std::ostream& operator<<(std::ostream& os, const Sequence& seq); + + /* I/O Sequences */ + + class IoSequence { + public: + virtual ~IoSequence() = default; + + virtual std::string get_name() const = 0; + int get_step() const; + int get_seed() const; + + std::unique_ptr<IoOp> next(); + + static std::unique_ptr<IoSequence> + generate_sequence(Sequence s, std::pair<int,int> obj_size_range, int seed ); + + protected: + uint64_t min_obj_size; + uint64_t max_obj_size; + bool create; + bool barrier; + bool done; + bool remove; + uint64_t obj_size; + int step; + int seed; + ceph::util::random_number_generator<int> rng = + ceph::util::random_number_generator<int>(); + + IoSequence(std::pair<int,int> obj_size_range, int seed); + + virtual std::unique_ptr<IoOp> _next() = 0; + + void set_min_object_size(uint64_t size); + void set_max_object_size(uint64_t size); + void select_random_object_size(); + std::unique_ptr<IoOp> increment_object_size(); + + }; + + class Seq0: public IoSequence { + public: + Seq0(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + }; + + class Seq1: public IoSequence { + public: + Seq1(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + std::unique_ptr<IoOp> _next(); + + private: + int count; + }; + + class Seq2: public IoSequence { + public: + Seq2(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + }; + + class Seq3: public IoSequence { + public: + Seq3(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + private: + uint64_t offset1; + uint64_t offset2; + }; + + class Seq4: public IoSequence { + public: + Seq4(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset1; + uint64_t 
offset2; + }; + + class Seq5: public IoSequence { + public: + Seq5(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + bool doneread; + bool donebarrier; + }; + + class Seq6: public IoSequence { + public: + Seq6(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + bool doneread; + bool donebarrier; + }; + + class Seq7: public IoSequence { + public: + Seq7(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + bool doneread = true; + bool donebarrier = false; + }; + + class Seq8: public IoSequence { + public: + Seq8(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + private: + uint64_t offset1; + uint64_t offset2; + bool doneread = true; + bool donebarrier = false; + }; + + class Seq9: public IoSequence { + private: + uint64_t offset; + uint64_t length; + bool doneread = true; + bool donebarrier = false; + + public: + Seq9(std::pair<int,int> obj_size_range, int seed); + + std::string get_name() const override; + + std::unique_ptr<IoOp> _next() override; + }; + } +}
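Taken together, generate_sequence() and next() are meant to be consumed by a simple driver loop. The sketch below is illustrative only and not part of the patch: it assumes the sequence terminates by emitting an op with OpType::Done (the case RadosIo::applyIoOp drains on), and it omits the readyForIoOp() gating a real exerciser performs (for RadosIo that check must be made with the model's lock held).

// Illustrative driver, under the assumptions above: run one sequence
// against a Model and report how many I/Os it generated.
#include <iostream>
#include <memory>
#include <utility>

#include "IoSequence.h"
#include "Model.h"

void run_sequence(ceph::io_exerciser::Model& model,
                  ceph::io_exerciser::Sequence s,
                  std::pair<int,int> obj_size_range,
                  int seed)
{
  auto seq = ceph::io_exerciser::IoSequence::generate_sequence(
      s, obj_size_range, seed);
  for (;;) {
    std::unique_ptr<ceph::io_exerciser::IoOp> op = seq->next();
    model.applyIoOp(*op);   // a RadosIo model scales offsets by block_size
    if (op->op == ceph::io_exerciser::OpType::Done) {
      break;                // sequence exhausted
    }
  }
  std::cout << seq->get_name() << ": "
            << model.get_num_io() << " I/Os" << std::endl;
}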
\ No newline at end of file diff --git a/src/common/io_exerciser/Model.cc b/src/common/io_exerciser/Model.cc new file mode 100644 index 00000000000..50812ecbb15 --- /dev/null +++ b/src/common/io_exerciser/Model.cc @@ -0,0 +1,28 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#include "Model.h" + +using Model = ceph::io_exerciser::Model; + +Model::Model(const std::string& oid, uint64_t block_size) : +num_io(0), +oid(oid), +block_size(block_size) +{ + +} + +const uint64_t Model::get_block_size() const +{ + return block_size; +} + +const std::string Model::get_oid() const +{ + return oid; +} + +int Model::get_num_io() const +{ + return num_io; +}
\ No newline at end of file diff --git a/src/common/io_exerciser/Model.h b/src/common/io_exerciser/Model.h new file mode 100644 index 00000000000..58d107409a6 --- /dev/null +++ b/src/common/io_exerciser/Model.h @@ -0,0 +1,49 @@ +#pragma once + +#include "IoOp.h" + +#include <boost/asio/io_context.hpp> + +#include "librados/librados_asio.h" + +#include "include/interval_set.h" +#include "global/global_init.h" +#include "global/global_context.h" +#include "common/Thread.h" + +/* Overview + * + * class Model + * Virtual class. Models apply IoOps generated by an + * IoSequence, they can choose how many I/Os to execute in + * parallel and scale up the size of I/Os by the blocksize + * + */ + +namespace ceph { + namespace io_exerciser { + + class Model + { + protected: + int num_io{0}; + std::string oid; + uint64_t block_size; + + public: + Model(const std::string& oid, uint64_t block_size); + virtual ~Model() = default; + + virtual bool readyForIoOp(IoOp& op) = 0; + virtual void applyIoOp(IoOp& op) = 0; + + const std::string get_oid() const; + const uint64_t get_block_size() const; + int get_num_io() const; + }; + + /* Simple RADOS I/O generator */ + + + } +}
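Model is deliberately small: a concrete back end only needs the constructor plus the two virtuals. A hypothetical minimal subclass (not part of the patch) makes the contract concrete; it accepts every op and just counts it, which is enough to act as a placeholder back end when wiring up a new exerciser.

// Hypothetical example: the smallest useful Model subclass.
#include "Model.h"

class NullModel : public ceph::io_exerciser::Model {
 public:
  NullModel(const std::string& oid, uint64_t block_size)
    : Model(oid, block_size) {}

  bool readyForIoOp(ceph::io_exerciser::IoOp&) override {
    return true;   // never throttles
  }

  void applyIoOp(ceph::io_exerciser::IoOp&) override {
    num_io++;      // a real model would issue or simulate the I/O here
  }
};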
\ No newline at end of file diff --git a/src/common/io_exerciser/ObjectModel.cc b/src/common/io_exerciser/ObjectModel.cc new file mode 100644 index 00000000000..589f6434282 --- /dev/null +++ b/src/common/io_exerciser/ObjectModel.cc @@ -0,0 +1,174 @@ +#include "ObjectModel.h" + +#include <algorithm> +#include <execution> +#include <iterator> + +using ObjectModel = ceph::io_exerciser::ObjectModel; + +ObjectModel::ObjectModel(const std::string& oid, uint64_t block_size, int seed) : + Model(oid, block_size), created(false) +{ + rng.seed(seed); +} + +int ObjectModel::get_seed(uint64_t offset) const +{ + ceph_assert(offset < contents.size()); + return contents[offset]; +} + +std::vector<int> ObjectModel::get_seed_offsets(int seed) const +{ + std::vector<int> offsets; + for (size_t i = 0; i < contents.size(); i++) + { + if (contents[i] == seed) + { + offsets.push_back(i); + } + } + + return offsets; +} + +std::string ObjectModel::to_string(int mask) const +{ + if (!created) { + return "Object does not exist"; + } + std::string result = "{"; + for (uint64_t i = 0; i < contents.size(); i++) { + if (i != 0) { + result += ","; + } + result += std::to_string(contents[i] & mask); + } + result += "}"; + return result; +} + +bool ObjectModel::readyForIoOp(IoOp& op) +{ + return true; +} + +void ObjectModel::applyIoOp(IoOp& op) +{ + auto generate_random = [&rng = rng]() { + return rng(); + }; + + switch (op.op) { + case OpType::BARRIER: + reads.clear(); + writes.clear(); + break; + + case OpType::CREATE: + ceph_assert(!created); + ceph_assert(reads.empty()); + ceph_assert(writes.empty()); + created = true; + contents.resize(op.length1); + std::generate(std::execution::seq, contents.begin(), contents.end(), + generate_random); + break; + + case OpType::REMOVE: + ceph_assert(created); + ceph_assert(reads.empty()); + ceph_assert(writes.empty()); + created = false; + contents.resize(0); + break; + + case OpType::READ3: + ceph_assert(created); + ceph_assert(op.offset3 + op.length3 <= contents.size()); + // Not allowed: read overlapping with parallel write + ceph_assert(!writes.intersects(op.offset3, op.length3)); + reads.union_insert(op.offset3, op.length3); + [[fallthrough]]; + + case OpType::READ2: + ceph_assert(created); + ceph_assert(op.offset2 + op.length2 <= contents.size()); + // Not allowed: read overlapping with parallel write + ceph_assert(!writes.intersects(op.offset2, op.length2)); + reads.union_insert(op.offset2, op.length2); + [[fallthrough]]; + + case OpType::READ: + ceph_assert(created); + ceph_assert(op.offset1 + op.length1 <= contents.size()); + // Not allowed: read overlapping with parallel write + ceph_assert(!writes.intersects(op.offset1, op.length1)); + reads.union_insert(op.offset1, op.length1); + num_io++; + break; + + case OpType::WRITE3: + ceph_assert(created); + // Not allowed: write overlapping with parallel read or write + ceph_assert(!reads.intersects(op.offset3, op.length3)); + ceph_assert(!writes.intersects(op.offset3, op.length3)); + writes.union_insert(op.offset3, op.length3); + ceph_assert(op.offset3 + op.length3 <= contents.size()); + std::generate(std::execution::seq, + std::next(contents.begin(), op.offset3), + std::next(contents.begin(), op.offset3 + op.length3), + generate_random); + [[fallthrough]]; + + case OpType::WRITE2: + ceph_assert(created); + // Not allowed: write overlapping with parallel read or write + ceph_assert(!reads.intersects(op.offset2, op.length2)); + ceph_assert(!writes.intersects(op.offset2, op.length2)); + writes.union_insert(op.offset2, op.length2); 
+ ceph_assert(op.offset2 + op.length2 <= contents.size()); + std::generate(std::execution::seq, + std::next(contents.begin(), op.offset2), + std::next(contents.begin(), op.offset2 + op.length2), + generate_random); + [[fallthrough]]; + + case OpType::WRITE: + ceph_assert(created); + // Not allowed: write overlapping with parallel read or write + ceph_assert(!reads.intersects(op.offset1, op.length1)); + ceph_assert(!writes.intersects(op.offset1, op.length1)); + writes.union_insert(op.offset1, op.length1); + ceph_assert(op.offset1 + op.length1 <= contents.size()); + std::generate(std::execution::seq, + std::next(contents.begin(), op.offset1), + std::next(contents.begin(), op.offset1 + op.length1), + generate_random); + num_io++; + break; + default: + break; + } +} + +void ObjectModel::encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(created, bl); + if (created) { + encode(contents, bl); + } + ENCODE_FINISH(bl); +} + +void ObjectModel::decode(ceph::buffer::list::const_iterator& bl) { + DECODE_START(1, bl); + DECODE_OLDEST(1); + decode(created, bl); + if (created) { + decode(contents, bl); + } else { + contents.resize(0); + } + DECODE_FINISH(bl); +} diff --git a/src/common/io_exerciser/ObjectModel.h b/src/common/io_exerciser/ObjectModel.h new file mode 100644 index 00000000000..93c70f41429 --- /dev/null +++ b/src/common/io_exerciser/ObjectModel.h @@ -0,0 +1,53 @@ +#pragma once + +#include "Model.h" + +/* Overview + * + * class ObjectModel + * An IoExerciser. Tracks the data stored in an object, applies + * IoOp's to update the model. Polices that I/Os that are + * permitted to run in parallel do not break rules. Provides + * interface to query state of object. State can be encoded + * and decoded + * + */ + +namespace ceph { + namespace io_exerciser { + /* Model of an object to track its data contents */ + + class ObjectModel : public Model { + private: + bool created; + std::vector<int> contents; + ceph::util::random_number_generator<int> rng = + ceph::util::random_number_generator<int>(); + + // Track read and write I/Os that can be submitted in + // parallel to detect violations: + // + // * Read may not overlap with a parallel write + // * Write may not overlap with a parallel read or write + // * Create / remove may not be in parallel with read or write + // + // Fix broken test cases by adding barrier ops to restrict + // I/O exercisers from issuing conflicting ops in parallel + interval_set<uint64_t> reads; + interval_set<uint64_t> writes; + public: + ObjectModel(const std::string& oid, uint64_t block_size, int seed); + + int get_seed(uint64_t offset) const; + std::vector<int> get_seed_offsets(int seed) const; + + std::string to_string(int mask = -1) const; + + bool readyForIoOp(IoOp& op); + void applyIoOp(IoOp& op); + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + }; + } +}
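The reads and writes interval sets in ObjectModel exist only to police the concurrency rules listed in the header comment; the ceph_assert() calls in applyIoOp() are the enforcement. A minimal sketch of the check they implement, using the same interval_set operations as the patch (illustrative only): between barriers, a write may not overlap any range that is being read or written in parallel.

// Sketch of the write rule ObjectModel asserts on.
#include <cstdint>
#include "include/interval_set.h"

bool write_allowed(const interval_set<uint64_t>& parallel_reads,
                   const interval_set<uint64_t>& parallel_writes,
                   uint64_t offset, uint64_t length)
{
  return !parallel_reads.intersects(offset, length) &&
         !parallel_writes.intersects(offset, length);
}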
\ No newline at end of file diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc new file mode 100644 index 00000000000..44b82260263 --- /dev/null +++ b/src/common/io_exerciser/RadosIo.cc @@ -0,0 +1,300 @@ +#include "RadosIo.h" + +#include "DataGenerator.h" + +using RadosIo = ceph::io_exerciser::RadosIo; + +RadosIo::RadosIo(librados::Rados& rados, + boost::asio::io_context& asio, + const std::string& pool, + const std::string& oid, + uint64_t block_size, + int seed, + int threads, + ceph::mutex& lock, + ceph::condition_variable& cond) : + Model(oid, block_size), + rados(rados), + asio(asio), + om(std::make_unique<ObjectModel>(oid, block_size, seed)), + db(data_generation::DataGenerator::create_generator( + data_generation::GenerationType::HeaderedSeededRandom, *om)), + pool(pool), + threads(threads), + lock(lock), + cond(cond), + outstanding_io(0) +{ + int rc; + rc = rados.ioctx_create(pool.c_str(), io); + ceph_assert(rc == 0); + allow_ec_overwrites(true); +} + +RadosIo::~RadosIo() +{ +} + +void RadosIo::start_io() +{ + std::lock_guard l(lock); + outstanding_io++; +} + +void RadosIo::finish_io() +{ + std::lock_guard l(lock); + ceph_assert(outstanding_io > 0); + outstanding_io--; + cond.notify_all(); +} + +void RadosIo::wait_for_io(int count) +{ + std::unique_lock l(lock); + while (outstanding_io > count) { + cond.wait(l); + } +} + +void RadosIo::allow_ec_overwrites(bool allow) +{ + int rc; + bufferlist inbl, outbl; + std::string cmdstr = + "{\"prefix\": \"osd pool set\", \"pool\": \"" + pool + "\", \ + \"var\": \"allow_ec_overwrites\", \"val\": \"" + + (allow ? "true" : "false") + "\"}"; + rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr); + ceph_assert(rc == 0); +} + +RadosIo::AsyncOpInfo::AsyncOpInfo(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3 ) : + offset1(offset1), length1(length1), + offset2(offset2), length2(length2), + offset3(offset3), length3(length3) +{ + +} + +bool RadosIo::readyForIoOp(IoOp &op) +{ + ceph_assert(ceph_mutex_is_locked_by_me(lock)); //Must be called with lock held + if (!om->readyForIoOp(op)) { + return false; + } + switch (op.op) { + case OpType::Done: + case OpType::BARRIER: + return outstanding_io == 0; + default: + return outstanding_io < threads; + } +} + +void RadosIo::applyIoOp(IoOp &op) +{ + std::shared_ptr<AsyncOpInfo> op_info; + + om->applyIoOp(op); + + // If there are thread concurrent I/Os in flight then wait for + // at least one I/O to complete + wait_for_io(threads-1); + + switch (op.op) { + case OpType::Done: + [[ fallthrough ]]; + case OpType::BARRIER: + // Wait for all outstanding I/O to complete + wait_for_io(0); + break; + + case OpType::CREATE: + { + start_io(); + op_info = std::make_shared<AsyncOpInfo>(0, op.length1); + op_info->bl1 = db->generate_data(0, op.length1); + op_info->wop.write_full(op_info->bl1); + auto create_cb = [this] (boost::system::error_code ec, + version_t ver) { + ceph_assert(ec == boost::system::errc::success); + finish_io(); + }; + librados::async_operate(asio, io, oid, + &op_info->wop, 0, nullptr, create_cb); + } + break; + + case OpType::REMOVE: + { + start_io(); + op_info = std::make_shared<AsyncOpInfo>(); + op_info->wop.remove(); + auto remove_cb = [this] (boost::system::error_code ec, + version_t ver) { + ceph_assert(ec == boost::system::errc::success); + finish_io(); + }; + librados::async_operate(asio, io, oid, + &op_info->wop, 0, nullptr, remove_cb); + } + break; + + case OpType::READ: + { + start_io(); + 
op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1); + op_info->rop.read(op.offset1 * block_size, + op.length1 * block_size, + &op_info->bl1, nullptr); + auto read_cb = [this, op_info] (boost::system::error_code ec, + version_t ver, + bufferlist bl) { + ceph_assert(ec == boost::system::errc::success); + ceph_assert(db->validate(op_info->bl1, + op_info->offset1, + op_info->length1)); + finish_io(); + }; + librados::async_operate(asio, io, oid, + &op_info->rop, 0, nullptr, read_cb); + num_io++; + } + break; + + case OpType::READ2: + { + start_io(); + op_info = std::make_shared<AsyncOpInfo>(op.offset1, + op.length1, + op.offset2, + op.length2); + + op_info->rop.read(op.offset1 * block_size, + op.length1 * block_size, + &op_info->bl1, nullptr); + op_info->rop.read(op.offset2 * block_size, + op.length2 * block_size, + &op_info->bl2, nullptr); + auto read2_cb = [this, op_info] (boost::system::error_code ec, + version_t ver, + bufferlist bl) { + ceph_assert(ec == boost::system::errc::success); + ceph_assert(db->validate(op_info->bl1, + op_info->offset1, + op_info->length1)); + ceph_assert(db->validate(op_info->bl2, + op_info->offset2, + op_info->length2)); + finish_io(); + }; + librados::async_operate(asio, io, oid, + &op_info->rop, 0, nullptr, read2_cb); + num_io++; + } + break; + + case OpType::READ3: + { + start_io(); + op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1, + op.offset2, op.length2, + op.offset3, op.length3); + op_info->rop.read(op.offset1 * block_size, + op.length1 * block_size, + &op_info->bl1, nullptr); + op_info->rop.read(op.offset2 * block_size, + op.length2 * block_size, + &op_info->bl2, nullptr); + op_info->rop.read(op.offset3 * block_size, + op.length3 * block_size, + &op_info->bl3, nullptr); + auto read3_cb = [this, op_info] (boost::system::error_code ec, + version_t ver, + bufferlist bl) { + ceph_assert(ec == boost::system::errc::success); + ceph_assert(db->validate(op_info->bl1, + op_info->offset1, + op_info->length1)); + ceph_assert(db->validate(op_info->bl2, + op_info->offset2, + op_info->length2)); + ceph_assert(db->validate(op_info->bl3, + op_info->offset3, + op_info->length3)); + finish_io(); + }; + librados::async_operate(asio, io, oid, + &op_info->rop, 0, nullptr, read3_cb); + num_io++; + } + break; + + case OpType::WRITE: + { + start_io(); + op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1); + op_info->bl1 = db->generate_data(op.offset1, op.length1); + + op_info->wop.write(op.offset1 * block_size, op_info->bl1); + auto write_cb = [this] (boost::system::error_code ec, + version_t ver) { + ceph_assert(ec == boost::system::errc::success); + finish_io(); + }; + librados::async_operate(asio, io, oid, + &op_info->wop, 0, nullptr, write_cb); + num_io++; + } + break; + + case OpType::WRITE2: + { + start_io(); + op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1, + op.offset2, op.length2); + op_info->bl1 = db->generate_data(op.offset1, op.length1); + op_info->bl2 = db->generate_data(op.offset2, op.length2); + op_info->wop.write(op.offset1 * block_size, op_info->bl1); + op_info->wop.write(op.offset2 * block_size, op_info->bl2); + auto write2_cb = [this] (boost::system::error_code ec, + version_t ver) { + ceph_assert(ec == boost::system::errc::success); + finish_io(); + }; + librados::async_operate(asio, io, oid, + &op_info->wop, 0, nullptr, write2_cb); + num_io++; + } + break; + + case OpType::WRITE3: + { + start_io(); + op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1, + op.offset2, op.length2, + 
op.offset3, op.length3); + op_info->bl1 = db->generate_data(op.offset1, op.length1); + op_info->bl2 = db->generate_data(op.offset2, op.length2); + op_info->bl3 = db->generate_data(op.offset3, op.length3); + op_info->wop.write(op.offset1 * block_size, op_info->bl1); + op_info->wop.write(op.offset2 * block_size, op_info->bl2); + op_info->wop.write(op.offset3 * block_size, op_info->bl3); + auto write3_cb = [this] (boost::system::error_code ec, + version_t ver) { + ceph_assert(ec == boost::system::errc::success); + finish_io(); + }; + librados::async_operate(asio, io, oid, + &op_info->wop, 0, nullptr, write3_cb); + num_io++; + } + break; + + default: + break; + } +} diff --git a/src/common/io_exerciser/RadosIo.h b/src/common/io_exerciser/RadosIo.h new file mode 100644 index 00000000000..179c5bba3ae --- /dev/null +++ b/src/common/io_exerciser/RadosIo.h @@ -0,0 +1,80 @@ +#pragma once + +#include "ObjectModel.h" + +/* Overview + * + * class RadosIo + * An IoExerciser. A simple RADOS client that generates I/Os + * from IoOps. Uses an ObjectModel to track the data stored + * in the object. Uses DataBuffer to create and validate + * data buffers. When there are not barrier I/Os this may + * issue multiple async I/Os in parallel. + * + */ + +namespace ceph { + namespace io_exerciser { + namespace data_generation { + class DataGenerator; + } + + class RadosIo: public Model { + protected: + librados::Rados& rados; + boost::asio::io_context& asio; + std::unique_ptr<ObjectModel> om; + std::unique_ptr<ceph::io_exerciser::data_generation::DataGenerator> db; + std::string pool; + int threads; + ceph::mutex& lock; + ceph::condition_variable& cond; + librados::IoCtx io; + int outstanding_io; + + void start_io(); + void finish_io(); + void wait_for_io(int count); + + public: + RadosIo(librados::Rados& rados, + boost::asio::io_context& asio, + const std::string& pool, + const std::string& oid, + uint64_t block_size, + int seed, + int threads, + ceph::mutex& lock, + ceph::condition_variable& cond); + + ~RadosIo(); + + void allow_ec_overwrites(bool allow); + + class AsyncOpInfo { + public: + librados::ObjectReadOperation rop; + librados::ObjectWriteOperation wop; + ceph::buffer::list bl1; + ceph::buffer::list bl2; + ceph::buffer::list bl3; + uint64_t offset1; + uint64_t length1; + uint64_t offset2; + uint64_t length2; + uint64_t offset3; + uint64_t length3; + + AsyncOpInfo(uint64_t offset1 = 0, uint64_t length1 = 0, + uint64_t offset2 = 0, uint64_t length2 = 0, + uint64_t offset3 = 0, uint64_t length3 = 0 ); + ~AsyncOpInfo() = default; + }; + + // Must be called with lock held + bool readyForIoOp(IoOp& op); + + void applyIoOp(IoOp& op); + }; + } +}
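RadosIo bounds parallelism with the outstanding_io counter guarded by the caller-supplied lock and condition variable: applyIoOp() waits until fewer than `threads` I/Os are in flight, and a BARRIER or Done op drains everything. The fragment below is a hypothetical wiring example only (the pool name, object name, block size and seed are assumptions, and error handling and cluster setup are omitted); it shows how RadosIo fits together with the driver loop sketched after IoSequence.h.

// Hypothetical wiring sketch under the assumptions above.
void example(librados::Rados& rados, boost::asio::io_context& asio)
{
  // 'rados' is assumed already init'ed/connected, 'asio' serviced elsewhere.
  ceph::mutex lock = ceph::make_mutex("io_exerciser::example");
  ceph::condition_variable cond;

  ceph::io_exerciser::RadosIo model(rados, asio,
                                    "test_pool", "test_object",
                                    4096 /* block_size */, 42 /* seed */,
                                    8 /* max parallel I/Os */, lock, cond);

  // Drive it with the run_sequence() helper sketched earlier; the Done op
  // at the end of the sequence drains all outstanding I/O.
  run_sequence(model, ceph::io_exerciser::Sequence::SEQUENCE_SEQ0,
               std::make_pair(1, 32) /* object size range in blocks */,
               42 /* seed */);
}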
\ No newline at end of file diff --git a/src/common/map_cacher.hpp b/src/common/map_cacher.hpp index 4d843be75dc..95353425de9 100644 --- a/src/common/map_cacher.hpp +++ b/src/common/map_cacher.hpp @@ -16,6 +16,7 @@ #define MAPCACHER_H #include "include/Context.h" +#include "include/expected.hpp" #include "common/sharedptr_registry.hpp" namespace MapCacher { @@ -130,6 +131,50 @@ public: return -EINVAL; } ///< @return error value, 0 on success, -ENOENT if no more entries + /// Fetch first key/value std::pair after specified key + struct PosAndData { + K last_key; + V data; + }; + using MaybePosAndData = tl::expected<PosAndData, int>; + + MaybePosAndData get_1st_after_key( + K key ///< [in] key after which to get next + ) + { + ceph_assert(driver); + while (true) { + std::pair<K, boost::optional<V>> cached; + bool got_cached = in_progress.get_next(key, &cached); + + ///\todo a driver->get_next() that returns an expected<K, V> would be nice + bool got_store{false}; + std::pair<K, V> store; + int r = driver->get_next(key, &store); + if (r < 0 && r != -ENOENT) { + return tl::unexpected(r); + } else if (r == 0) { + got_store = true; + } + + if (!got_cached && !got_store) { + return tl::unexpected(-ENOENT); + } else if (got_cached && (!got_store || store.first >= cached.first)) { + if (cached.second) { + return PosAndData{cached.first, *cached.second}; + } else { + key = cached.first; + continue; // value was cached as removed, recurse + } + } else { + return PosAndData{store.first, store.second}; + } + } + ceph_abort(); // not reachable + return tl::unexpected(-EINVAL); + } + + /// Adds operation setting keys to Transaction void set_keys( const std::map<K, V> &keys, ///< [in] keys/values to std::set diff --git a/src/common/mutex_debug.h b/src/common/mutex_debug.h index c1a4ff2a435..d56d0ebee99 100644 --- a/src/common/mutex_debug.h +++ b/src/common/mutex_debug.h @@ -169,20 +169,16 @@ public: } bool try_lock(bool no_lockdep = false) { - bool locked = try_lock_impl(); - if (locked) { - if (enable_lockdep(no_lockdep)) - _locked(); - _post_lock(); - } - return locked; + ceph_assert(recursive || !is_locked_by_me()); + return _try_lock(no_lockdep); } void lock(bool no_lockdep = false) { + ceph_assert(recursive || !is_locked_by_me()); if (enable_lockdep(no_lockdep)) _will_lock(recursive); - if (try_lock(no_lockdep)) + if (_try_lock(no_lockdep)) return; lock_impl(); @@ -198,6 +194,16 @@ public: unlock_impl(); } +private: + bool _try_lock(bool no_lockdep) { + bool locked = try_lock_impl(); + if (locked) { + if (enable_lockdep(no_lockdep)) + _locked(); + _post_lock(); + } + return locked; + } }; diff --git a/src/common/options.h b/src/common/options.h index ad39936d43a..abded4cc0dd 100644 --- a/src/common/options.h +++ b/src/common/options.h @@ -207,8 +207,8 @@ struct Option { typedef std::function<int(std::string *, std::string *)> validator_fn_t; validator_fn_t validator; - Option(std::string const &name, type_t t, level_t l) - : name(name), type(t), level(l) + Option(std::string &&name, type_t t, level_t l) + : name(std::move(name)), type(t), level(l) { // While value_t is nullable (via std::monostate), we don't ever // want it set that way in an Option instance: within an instance, diff --git a/src/common/options/mgr.yaml.in b/src/common/options/mgr.yaml.in index f29182930b2..773b0d36591 100644 --- a/src/common/options/mgr.yaml.in +++ b/src/common/options/mgr.yaml.in @@ -292,6 +292,15 @@ options: default: true services: - mgr +- name: mon_warn_on_pool_no_app_grace + type: secs + level: dev + desc: time 
after which POOL_APP_NOT_ENABLED health warning is issued + default: 5_min + services: + - mgr + see_also: + - mon_warn_on_pool_no_app - name: mon_warn_on_too_few_osds type: bool level: advanced diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in index e12061cf93c..6bfb760d4d3 100644 --- a/src/common/options/osd.yaml.in +++ b/src/common/options/osd.yaml.in @@ -212,11 +212,8 @@ options: long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day. fmt_desc: This restricts scrubbing to this hour of the day or later. Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` - to allow scrubbing the entire day. Along with ``osd_scrub_end_hour``, they define a time - window, in which the scrubs can happen. - But a scrub will be performed - no matter whether the time window allows or not, as long as the placement - group's scrub interval exceeds ``osd_scrub_max_interval``. + to allow scrubbing the entire day. Along with ``osd_scrub_end_hour`` they define a time + window, only in which will periodic scrubs be initiated. default: 0 see_also: - osd_scrub_end_hour @@ -228,12 +225,10 @@ options: level: advanced desc: Restrict scrubbing to hours of the day earlier than this long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day. - fmt_desc: This restricts scrubbing to the hour earlier than this. + fmt_desc: This restricts scrubbing to the hours earlier than this. Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing for the entire day. Along with ``osd_scrub_begin_hour``, they define a time - window, in which the scrubs can happen. But a scrub will be performed - no matter whether the time window allows or not, as long as the placement - group's scrub interval exceeds ``osd_scrub_max_interval``. + window, only in which can periodic scrubs be automatically initiated. default: 0 see_also: - osd_scrub_begin_hour @@ -250,9 +245,7 @@ options: 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0`` and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week. Along with ``osd_scrub_end_week_day``, they define a time window in which - scrubs can happen. But a scrub will be performed - no matter whether the time window allows or not, when the PG's - scrub interval exceeds ``osd_scrub_max_interval``. + periodic scrubs can be automatically initiated. default: 0 see_also: - osd_scrub_end_week_day @@ -269,9 +262,7 @@ options: 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0`` and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week. Along with ``osd_scrub_begin_week_day``, they define a time - window, in which the scrubs can happen. But a scrub will be performed - no matter whether the time window allows or not, as long as the placement - group's scrub interval exceeds ``osd_scrub_max_interval``. + window, in which periodic scrubs can be automatically initiated. default: 0 see_also: - osd_scrub_begin_week_day @@ -282,8 +273,9 @@ options: type: float level: advanced desc: Allow scrubbing when system load divided by number of CPUs is below this value - fmt_desc: The normalized maximum load. Ceph will not scrub when the system load - (as defined by ``getloadavg() / number of online CPUs``) is higher than this number. + fmt_desc: The normalized maximum load. Ceph will not initiate periodic (regular) + scrubs when the system load (as defined by ``getloadavg() / number of online CPUs``) + is higher than this number. Default is ``0.5``. 
default: 0.5 with_legacy: true @@ -292,8 +284,7 @@ options: type: float level: advanced desc: The desired interval between scrubs of a specific PG. - fmt_desc: The desired interval in seconds between scrubs of a specific PG - when the Ceph Storage Cluster load is low. + fmt_desc: The desired interval in seconds between scrubs of a specific PG. default: 1_day see_also: - osd_scrub_max_interval @@ -303,8 +294,7 @@ options: type: float level: advanced desc: Scrub each PG no less often than this interval - fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon - irrespective of cluster load. + fmt_desc: The maximum interval in seconds for scrubbing each PG. default: 7_day see_also: - osd_scrub_min_interval @@ -315,7 +305,7 @@ options: level: advanced desc: Ratio of scrub interval to randomly vary long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals - so that they are soon uniformly distributed over the week + so that they are uniformly distributed over time. fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling the next scrub job for a PG. The delay is a random value less than ``osd_scrub_min_interval`` \* diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in index f3d242b1fe3..0ce5bc332fd 100644 --- a/src/common/options/rgw.yaml.in +++ b/src/common/options/rgw.yaml.in @@ -59,6 +59,14 @@ options: services: - rgw with_legacy: true +- name: rgw_parquet_buffer_size + type: size + level: advanced + desc: the Maximum parquet buffer size, a limit to memory consumption for parquet reading operations. + default: 16_M + services: + - rgw + with_legacy: true - name: rgw_rados_tracing type: bool level: advanced @@ -448,6 +456,19 @@ options: services: - rgw with_legacy: true +- name: rgw_restore_debug_interval + type: int + level: dev + desc: The number of seconds that simulate one "day" in order to debug RGW CloudRestore. + Do *not* modify for a production cluster. + long_desc: For debugging RGW Cloud Restore, the number of seconds that are equivalent to + one simulated "day". Values less than 1 are ignored and do not change Restore behavior. + For example, during debugging if one wanted every 10 minutes to be equivalent to one day, + then this would be set to 600, the number of seconds in 10 minutes. + default: -1 + services: + - rgw + with_legacy: true - name: rgw_mp_lock_max_time type: int level: advanced @@ -1865,6 +1886,18 @@ options: services: - rgw with_legacy: true +- name: rgw_graceful_stop + type: bool + level: advanced + desc: Delay the shutdown until all outstanding requests have completed + long_desc: Wait for up to `rgw_exit_timeout_secs` for all outstanding requests to complete + before exiting unconditionally. (new HTTP requests will not be accepted during this time.) + default: false + services: + - rgw + see_also: + - rgw_exit_timeout_secs + with_legacy: true - name: rgw_get_obj_window_size type: size level: advanced @@ -2066,14 +2099,6 @@ options: services: - rgw with_legacy: true -- name: rgw_data_log_obj_prefix - type: str - level: dev - default: data_log - fmt_desc: The object name prefix for the data log. 
- services: - - rgw - with_legacy: true - name: rgw_data_sync_poll_interval type: int level: dev diff --git a/src/common/scrub_types.cc b/src/common/scrub_types.cc index b03a3cab70c..4b4d191e09c 100644 --- a/src/common/scrub_types.cc +++ b/src/common/scrub_types.cc @@ -161,6 +161,13 @@ void inconsistent_obj_wrapper::encode(bufferlist& bl) const ENCODE_FINISH(bl); } +bufferlist inconsistent_obj_wrapper::encode() const +{ + bufferlist bl; + encode(bl); + return bl; +} + void inconsistent_obj_wrapper::decode(bufferlist::const_iterator& bp) { DECODE_START(2, bp); @@ -240,6 +247,13 @@ void inconsistent_snapset_wrapper::encode(bufferlist& bl) const ENCODE_FINISH(bl); } +bufferlist inconsistent_snapset_wrapper::encode() const +{ + bufferlist bl; + encode(bl); + return bl; +} + void inconsistent_snapset_wrapper::decode(bufferlist::const_iterator& bp) { DECODE_START(2, bp); diff --git a/src/common/scrub_types.h b/src/common/scrub_types.h index dd206f56f60..d86fc12b6c8 100644 --- a/src/common/scrub_types.h +++ b/src/common/scrub_types.h @@ -152,6 +152,7 @@ struct inconsistent_obj_wrapper : librados::inconsistent_obj_t { const pg_shard_t &primary); void set_version(uint64_t ver) { version = ver; } void encode(ceph::buffer::list& bl) const; + ceph::buffer::list encode() const; void decode(ceph::buffer::list::const_iterator& bp); }; @@ -181,6 +182,7 @@ struct inconsistent_snapset_wrapper : public librados::inconsistent_snapset_t { void set_size_mismatch(); void encode(ceph::buffer::list& bl) const; + ceph::buffer::list encode() const; void decode(ceph::buffer::list::const_iterator& bp); }; diff --git a/src/common/strtol.cc b/src/common/strtol.cc index c9e982b6396..c97942adec5 100644 --- a/src/common/strtol.cc +++ b/src/common/strtol.cc @@ -146,43 +146,54 @@ T strict_iec_cast(std::string_view str, std::string *err) if (u != std::string_view::npos) { n = str.substr(0, u); unit = str.substr(u, str.length() - u); + // handling cases when prefixes entered as KB, MB, ... + // and KiB, MiB, .... + if (unit.length() > 1 && unit.back() == 'B') { + unit = unit.substr(0, unit.length() - 1); + } // we accept both old si prefixes as well as the proper iec prefixes // i.e. K, M, ... and Ki, Mi, ... 
- if (unit.back() == 'i') { - if (unit.front() == 'B') { - *err = "strict_iecstrtoll: illegal prefix \"Bi\""; - return 0; - } - } if (unit.length() > 2) { *err = "strict_iecstrtoll: illegal prefix (length > 2)"; return 0; } - switch(unit.front()) { - case 'K': - m = 10; - break; - case 'M': - m = 20; - break; - case 'G': - m = 30; - break; - case 'T': - m = 40; - break; - case 'P': - m = 50; - break; - case 'E': - m = 60; - break; - case 'B': - break; - default: - *err = "strict_iecstrtoll: unit prefix not recognized"; - return 0; + if ((unit.back() == 'i') || (unit.length() == 1)) { + if (unit.back() == 'i') { + if (unit.front() == 'B') { + *err = "strict_iecstrtoll: illegal prefix \"Bi\""; + return 0; + } + } + switch(unit.front()) { + case 'K': + m = 10; + break; + case 'M': + m = 20; + break; + case 'G': + m = 30; + break; + case 'T': + m = 40; + break; + case 'P': + m = 50; + break; + case 'E': + m = 60; + break; + case 'B': + break; + default: + *err = ("strict_iecstrtoll: unit prefix not recognized '" + std::string{unit} + "' "); + return 0; + } } + else { + *err = ("strict_iecstrtoll: illegal prefix '" + std::string{unit} + "' "); + return 0; + } } long long ll = strict_strtoll(n, 10, err); diff --git a/src/crimson/common/log.h b/src/crimson/common/log.h index 4f564ac044d..c38b225c94b 100644 --- a/src/crimson/common/log.h +++ b/src/crimson/common/log.h @@ -90,7 +90,7 @@ static inline seastar::log_level to_log_level(int level) { #define SUBLOGDPP(subname_, level_, MSG, dpp, ...) \ LOGGER(subname_).log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__) #define SUBLOGDPPI(subname_, level_, MSG, dpp, ...) \ - LOGGER(subname_).log(level_, "{} {}: " MSG, \ + LOGGER(subname_).log(level_, "{} {} {}: " MSG, \ interruptor::get_interrupt_cond(), dpp, FNAME , ##__VA_ARGS__) #define SUBTRACEDPP(subname_, ...) SUBLOGDPP(subname_, seastar::log_level::trace, __VA_ARGS__) #define SUBTRACEDPPI(subname_, ...) SUBLOGDPPI(subname_, seastar::log_level::trace, __VA_ARGS__) @@ -106,7 +106,7 @@ static inline seastar::log_level to_log_level(int level) { #define LOGDPP(level_, MSG, dpp, ...) \ LOCAL_LOGGER.log(level_, "{} {}: " MSG, dpp, FNAME , ##__VA_ARGS__) #define LOGDPPI(level_, MSG, dpp, ...) \ - LOCAL_LOGGER.log(level_, "{} {}: " MSG, \ + LOCAL_LOGGER.log(level_, "{} {} {}: " MSG, \ interruptor::get_interrupt_cond(), dpp, FNAME , ##__VA_ARGS__) #define TRACEDPP(...) LOGDPP(seastar::log_level::trace, __VA_ARGS__) #define TRACEDPPI(...) 
LOGDPPI(seastar::log_level::trace, __VA_ARGS__) diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h index fe09cc54510..0dca695ba3a 100644 --- a/src/crimson/os/futurized_store.h +++ b/src/crimson/os/futurized_store.h @@ -75,14 +75,15 @@ public: CollectionRef c, const ghobject_t& oid) = 0; - using omap_values_t = std::map<std::string, ceph::bufferlist, std::less<>>; + using omap_values_t = attrs_t; using omap_keys_t = std::set<std::string>; virtual read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, const omap_keys_t& keys) = 0; - virtual read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( + using omap_values_paged_t = std::tuple<bool, omap_values_t>; + virtual read_errorator::future<omap_values_paged_t> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid const std::optional<std::string> &start ///< [in] start, empty for begin @@ -147,7 +148,8 @@ public: return seastar::now(); } - virtual read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( + using fiemap_ret_t = std::map<uint64_t, uint64_t>; + virtual read_errorator::future<fiemap_ret_t> fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index cf8d3c0891d..5dcb7514ee1 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -990,8 +990,12 @@ void Cache::mark_transaction_conflicted( } efforts.mutate_delta_bytes += delta_stat.bytes; - for (auto &i: t.pre_alloc_list) { - epm.mark_space_free(i->get_paddr(), i->get_length()); + if (t.get_pending_ool()) { + t.get_pending_ool()->is_conflicted = true; + } else { + for (auto &i: t.pre_alloc_list) { + epm.mark_space_free(i->get_paddr(), i->get_length()); + } } auto& ool_stats = t.get_ool_write_stats(); diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index cdad6dfb1b0..76c18bde667 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -158,12 +158,14 @@ parent_tracker_t::~parent_tracker_t() { std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs) { - out << "LBAMapping(" << rhs.get_key() << "~" << rhs.get_length() + out << "LBAMapping(" << rhs.get_key() + << "~0x" << std::hex << rhs.get_length() << std::dec << "->" << rhs.get_val(); if (rhs.is_indirect()) { - out << " indirect(" << rhs.get_intermediate_base() << "~" - << rhs.get_intermediate_key() << "~" - << rhs.get_intermediate_length() << ")"; + out << ",indirect(" << rhs.get_intermediate_base() + << "~0x" << std::hex << rhs.get_intermediate_length() + << "@0x" << rhs.get_intermediate_offset() << std::dec + << ")"; } out << ")"; return out; diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 6c5c6c6fcc2..6025725aa33 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -350,7 +350,7 @@ public: << ", modify_time=" << sea_time_point_printer_t{modify_time} << ", paddr=" << get_paddr() << ", prior_paddr=" << prior_poffset_str - << ", length=" << get_length() + << std::hex << ", length=0x" << get_length() << std::dec << ", state=" << state << ", last_committed_crc=" << last_committed_crc << ", refcount=" << use_count() diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc index 34ac199eed8..0458fbfed74 100644 --- 
a/src/crimson/os/seastore/extent_placement_manager.cc +++ b/src/crimson/os/seastore/extent_placement_manager.cc @@ -987,7 +987,19 @@ RandomBlockOolWriter::alloc_write_ool_extents( return alloc_write_iertr::now(); } return seastar::with_gate(write_guard, [this, &t, &extents] { - return do_write(t, extents); + seastar::lw_shared_ptr<rbm_pending_ool_t> ptr = + seastar::make_lw_shared<rbm_pending_ool_t>(); + ptr->pending_extents = t.get_pre_alloc_list(); + assert(!t.is_conflicted()); + t.set_pending_ool(ptr); + return do_write(t, extents + ).finally([this, ptr=ptr] { + if (ptr->is_conflicted) { + for (auto &e : ptr->pending_extents) { + rb_cleaner->mark_space_free(e->get_paddr(), e->get_length()); + } + } + }); }); } diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 5d6fa3cb1b1..ef10ff9623b 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -173,16 +173,22 @@ public: if (!parent_modified()) { return; } + LOG_PREFIX(BtreeLBAMapping::maybe_fix_pos); auto &p = static_cast<LBALeafNode&>(*parent); p.maybe_fix_mapping_pos(*this); + SUBDEBUGT(seastore_lba, "fixed pin {}", + ctx.trans, static_cast<LBAMapping&>(*this)); } LBAMappingRef refresh_with_pending_parent() final { + LOG_PREFIX(BtreeLBAMapping::refresh_with_pending_parent); assert(is_parent_valid() && !is_parent_viewable()); auto &p = static_cast<LBALeafNode&>(*parent); auto &viewable_p = static_cast<LBALeafNode&>( *p.find_pending_version(ctx.trans, get_key())); - return viewable_p.get_mapping(ctx, get_key()); + auto new_pin = viewable_p.get_mapping(ctx, get_key()); + SUBDEBUGT(seastore_lba, "new pin {}", ctx.trans, static_cast<LBAMapping&>(*new_pin)); + return new_pin; } protected: std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate( diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h index 960ea6ba411..397a014a7c3 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_layout.h @@ -925,7 +925,7 @@ class NodeLayoutT final : public InternalNodeImpl, public LeafNodeImpl { std::ostringstream sos; sos << "Node" << NODE_TYPE << FIELD_TYPE << "@" << extent.get_laddr() - << "+" << std::hex << extent.get_length() << std::dec + << "+0x" << std::hex << extent.get_length() << std::dec << "Lv" << (unsigned)level() << (is_level_tail() ? 
"$" : ""); name = sos.str(); diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 15774332373..d90edbb20db 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -17,6 +17,7 @@ #include "common/safe_io.h" #include "include/stringify.h" #include "os/Transaction.h" +#include "osd/osd_types_fmt.h" #include "crimson/common/buffer_io.h" @@ -30,8 +31,6 @@ #include "crimson/os/seastore/onode_manager.h" #include "crimson/os/seastore/object_data_handler.h" - -using std::string; using crimson::common::local_conf; template <> struct fmt::formatter<crimson::os::seastore::op_type_t> @@ -42,8 +41,8 @@ template <> struct fmt::formatter<crimson::os::seastore::op_type_t> auto format(op_type_t op, FormatContext& ctx) const { std::string_view name = "unknown"; switch (op) { - case op_type_t::TRANSACTION: - name = "transaction"; + case op_type_t::DO_TRANSACTION: + name = "do_transaction"; break; case op_type_t::READ: name = "read"; @@ -63,8 +62,8 @@ template <> struct fmt::formatter<crimson::os::seastore::op_type_t> case op_type_t::OMAP_GET_VALUES: name = "omap_get_values"; break; - case op_type_t::OMAP_LIST: - name = "omap_list"; + case op_type_t::OMAP_GET_VALUES2: + name = "omap_get_values2"; break; case op_type_t::MAX: name = "unknown"; @@ -143,14 +142,14 @@ void SeaStore::Shard::register_metrics() namespace sm = seastar::metrics; using op_type_t = crimson::os::seastore::op_type_t; std::pair<op_type_t, sm::label_instance> labels_by_op_type[] = { - {op_type_t::TRANSACTION, sm::label_instance("latency", "TRANSACTION")}, - {op_type_t::READ, sm::label_instance("latency", "READ")}, - {op_type_t::WRITE, sm::label_instance("latency", "WRITE")}, - {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")}, - {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")}, - {op_type_t::STAT, sm::label_instance("latency", "STAT")}, - {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")}, - {op_type_t::OMAP_LIST, sm::label_instance("latency", "OMAP_LIST")}, + {op_type_t::DO_TRANSACTION, sm::label_instance("latency", "DO_TRANSACTION")}, + {op_type_t::READ, sm::label_instance("latency", "READ")}, + {op_type_t::WRITE, sm::label_instance("latency", "WRITE")}, + {op_type_t::GET_ATTR, sm::label_instance("latency", "GET_ATTR")}, + {op_type_t::GET_ATTRS, sm::label_instance("latency", "GET_ATTRS")}, + {op_type_t::STAT, sm::label_instance("latency", "STAT")}, + {op_type_t::OMAP_GET_VALUES, sm::label_instance("latency", "OMAP_GET_VALUES")}, + {op_type_t::OMAP_GET_VALUES2, sm::label_instance("latency", "OMAP_GET_VALUES2")}, }; for (auto& [op_type, label] : labels_by_op_type) { @@ -194,6 +193,9 @@ void SeaStore::Shard::register_metrics() seastar::future<> SeaStore::start() { + LOG_PREFIX(SeaStore::start); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); #ifndef NDEBUG bool is_test = true; @@ -214,19 +216,30 @@ seastar::future<> SeaStore::start() }).then([this, is_test] { ceph_assert(device); return shard_stores.start(root, device.get(), is_test); + }).then([FNAME] { + INFO("done"); }); } seastar::future<> SeaStore::test_start(DeviceRef device_obj) { + LOG_PREFIX(SeaStore::test_start); + INFO("..."); + ceph_assert(device_obj); ceph_assert(root == ""); device = std::move(device_obj); - return shard_stores.start_single(root, device.get(), true); + return shard_stores.start_single(root, device.get(), true + ).then([FNAME] { + INFO("done"); + }); } seastar::future<> SeaStore::stop() { + 
LOG_PREFIX(SeaStore::stop); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return seastar::do_for_each(secondaries, [](auto& sec_dev) { return sec_dev->stop(); @@ -239,17 +252,28 @@ seastar::future<> SeaStore::stop() } }).then([this] { return shard_stores.stop(); + }).then([FNAME] { + INFO("done"); }); } SeaStore::mount_ertr::future<> SeaStore::test_mount() { + LOG_PREFIX(SeaStore::test_mount); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); - return shard_stores.local().mount_managers(); + return shard_stores.local().mount_managers( + ).then([FNAME] { + INFO("done"); + }); } SeaStore::mount_ertr::future<> SeaStore::mount() { + LOG_PREFIX(SeaStore::mount); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return device->mount( ).safe_then([this] { @@ -278,11 +302,13 @@ SeaStore::mount_ertr::future<> SeaStore::mount() return set_secondaries(); }); }); - }).safe_then([this] { - return shard_stores.invoke_on_all([](auto &local_store) { - return local_store.mount_managers(); - }); }); + }).safe_then([this] { + return shard_stores.invoke_on_all([](auto &local_store) { + return local_store.mount_managers(); + }); + }).safe_then([FNAME] { + INFO("done"); }).handle_error( crimson::ct_error::assert_all{ "Invalid error in SeaStore::mount" @@ -302,9 +328,14 @@ seastar::future<> SeaStore::Shard::mount_managers() seastar::future<> SeaStore::umount() { + LOG_PREFIX(SeaStore::umount); + INFO("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.invoke_on_all([](auto &local_store) { return local_store.umount(); + }).then([FNAME] { + INFO("done"); }); } @@ -332,7 +363,7 @@ seastar::future<> SeaStore::Shard::umount() onode_manager.reset(); }).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::umount" + "Invalid error in SeaStoreS::umount" } ); } @@ -345,15 +376,15 @@ seastar::future<> SeaStore::write_fsid(uuid_d new_osd_fsid) auto [ret, fsid] = tuple; std::string str_fsid = stringify(new_osd_fsid); if (ret == -1) { - return write_meta("fsid", stringify(new_osd_fsid)); + return write_meta("fsid", stringify(new_osd_fsid)); } else if (ret == 0 && fsid != str_fsid) { - ERROR("on-disk fsid {} != provided {}", - fsid, stringify(new_osd_fsid)); - throw std::runtime_error("store fsid error"); - } else { + ERROR("on-disk fsid {} != provided {}", + fsid, stringify(new_osd_fsid)); + throw std::runtime_error("store fsid error"); + } else { return seastar::now(); - } - }); + } + }); } seastar::future<> @@ -379,6 +410,8 @@ SeaStore::Shard::mkfs_managers() "mkfs_seastore", [this](auto& t) { + LOG_PREFIX(SeaStoreS::mkfs_managers); + DEBUGT("...", t); return onode_manager->mkfs(t ).si_then([this, &t] { return collection_manager->mkfs(t); @@ -412,15 +445,22 @@ seastar::future<> SeaStore::set_secondaries() SeaStore::mkfs_ertr::future<> SeaStore::test_mkfs(uuid_d new_osd_fsid) { + LOG_PREFIX(SeaStore::test_mkfs); + INFO("uuid={} ...", new_osd_fsid); + ceph_assert(seastar::this_shard_id() == primary_core); - return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) { + return read_meta("mkfs_done" + ).then([this, new_osd_fsid, FNAME](auto tuple) { auto [done, value] = tuple; if (done == 0) { + ERROR("failed"); return seastar::now(); } return shard_stores.local().mkfs_managers( ).then([this, new_osd_fsid] { return prepare_meta(new_osd_fsid); + }).then([FNAME] { + INFO("done"); }); }); } @@ -448,27 +488,29 @@ seastar::future<> SeaStore::prepare_meta(uuid_d new_osd_fsid) 
SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) { + LOG_PREFIX(SeaStore::mkfs); + INFO("uuid={}, root={} ...", new_osd_fsid, root); + ceph_assert(seastar::this_shard_id() == primary_core); - return read_meta("mkfs_done").then([this, new_osd_fsid] (auto tuple) { + return read_meta("mkfs_done" + ).then([this, new_osd_fsid, FNAME](auto tuple) { auto [done, value] = tuple; if (done == 0) { + ERROR("failed"); return seastar::now(); } else { return seastar::do_with( secondary_device_set_t(), - [this, new_osd_fsid](auto& sds) { + [this, new_osd_fsid, FNAME](auto& sds) { auto fut = seastar::now(); - LOG_PREFIX(SeaStore::mkfs); - DEBUG("root: {}", root); if (!root.empty()) { fut = seastar::open_directory(root - ).then([this, &sds, new_osd_fsid](seastar::file rdir) mutable { + ).then([this, &sds, new_osd_fsid, FNAME](seastar::file rdir) mutable { std::unique_ptr<seastar::file> root_f = std::make_unique<seastar::file>(std::move(rdir)); auto sub = root_f->list_directory( - [this, &sds, new_osd_fsid](auto de) mutable -> seastar::future<> + [this, &sds, new_osd_fsid, FNAME](auto de) mutable -> seastar::future<> { - LOG_PREFIX(SeaStore::mkfs); DEBUG("found file: {}", de.name); if (de.name.find("block.") == 0 && de.name.length() > 6 /* 6 for "block." */) { @@ -533,6 +575,8 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) return prepare_meta(new_osd_fsid); }).safe_then([this] { return umount(); + }).safe_then([FNAME] { + INFO("done"); }).handle_error( crimson::ct_error::assert_all{ "Invalid error in SeaStore::mkfs" @@ -542,18 +586,22 @@ SeaStore::mkfs_ertr::future<> SeaStore::mkfs(uuid_d new_osd_fsid) }); } -using coll_core_t = FuturizedStore::coll_core_t; +using coll_core_t = SeaStore::coll_core_t; seastar::future<std::vector<coll_core_t>> SeaStore::list_collections() { + LOG_PREFIX(SeaStore::list_collections); + DEBUG("..."); + ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.map([](auto &local_store) { return local_store.list_collections(); - }).then([](std::vector<std::vector<coll_core_t>> results) { + }).then([FNAME](std::vector<std::vector<coll_core_t>> results) { std::vector<coll_core_t> collections; for (auto& colls : results) { collections.insert(collections.end(), colls.begin(), colls.end()); } + DEBUG("got {} collections", collections.size()); return seastar::make_ready_future<std::vector<coll_core_t>>( std::move(collections)); }); @@ -561,14 +609,18 @@ SeaStore::list_collections() store_statfs_t SeaStore::Shard::stat() const { - return transaction_manager->store_stat(); + LOG_PREFIX(SeaStoreS::stat); + auto ss = transaction_manager->store_stat(); + DEBUG("stat={}", ss); + return ss; } seastar::future<store_statfs_t> SeaStore::stat() const { - ceph_assert(seastar::this_shard_id() == primary_core); LOG_PREFIX(SeaStore::stat); - DEBUG(""); + DEBUG("..."); + + ceph_assert(seastar::this_shard_id() == primary_core); return shard_stores.map_reduce0( [](const SeaStore::Shard &local_store) { return local_store.stat(); @@ -578,19 +630,30 @@ seastar::future<store_statfs_t> SeaStore::stat() const ss.add(ret); return std::move(ss); } - ).then([](store_statfs_t ss) { + ).then([FNAME](store_statfs_t ss) { + DEBUG("done, stat={}", ss); return seastar::make_ready_future<store_statfs_t>(std::move(ss)); }); } seastar::future<store_statfs_t> SeaStore::pool_statfs(int64_t pool_id) const { - //TODO - return SeaStore::stat(); + LOG_PREFIX(SeaStore::pool_statfs); + DEBUG("pool_id={} ...", pool_id); + ceph_assert(seastar::this_shard_id() == primary_core); 
+ //TODO + return SeaStore::stat( + ).then([FNAME, pool_id](store_statfs_t ss) { + DEBUG("done, pool_id={}, ret={}", pool_id, ss); + return seastar::make_ready_future<store_statfs_t>(std::move(ss)); + }); } seastar::future<> SeaStore::report_stats() { + LOG_PREFIX(SeaStore::report_stats); + DEBUG("..."); + ceph_assert(seastar::this_shard_id() == primary_core); shard_device_stats.resize(seastar::smp::count); shard_io_stats.resize(seastar::smp::count); @@ -609,8 +672,7 @@ seastar::future<> SeaStore::report_stats() local_store.get_io_stats(report_detail, seconds); shard_cache_stats[seastar::this_shard_id()] = local_store.get_cache_stats(report_detail, seconds); - }).then([this] { - LOG_PREFIX(SeaStore); + }).then([this, FNAME] { auto now = seastar::lowres_clock::now(); if (last_tp == seastar::lowres_clock::time_point::min()) { last_tp = now; @@ -857,24 +919,26 @@ SeaStore::Shard::list_objects(CollectionRef ch, "list_objects", [this, ch, start, end, &limit, &ret](auto &t) { + LOG_PREFIX(SeaStoreS::list_objects); + DEBUGT("cid={} start={} end={} limit={} ...", + t, ch->get_cid(), start, end, limit); return get_coll_bits( ch, t - ).si_then([this, ch, &t, start, end, &limit, &ret](auto bits) { + ).si_then([FNAME, this, ch, &t, start, end, &limit, &ret](auto bits) { if (!bits) { + DEBUGT("no bits, return none", t); return list_iertr::make_ready_future< OnodeManager::list_onodes_bare_ret >(std::make_tuple( std::vector<ghobject_t>(), ghobject_t::get_max())); } else { - LOG_PREFIX(SeaStore::list_objects); - DEBUGT("start {}, end {}, limit {}, bits {}", - t, start, end, limit, *bits); + DEBUGT("bits={} ...", t, *bits); auto filter = SeaStore::get_objs_range(ch, *bits); using list_iertr = OnodeManager::list_onodes_iertr; using repeat_ret = list_iertr::future<seastar::stop_iteration>; return trans_intr::repeat( - [this, &t, &ret, &limit, end, + [this, FNAME, &t, &ret, &limit, end, filter, ranges = get_ranges(ch, start, end, filter) ]() mutable -> repeat_ret { if (limit == 0 || ranges.empty()) { @@ -886,11 +950,10 @@ SeaStore::Shard::list_objects(CollectionRef ch, auto pstart = ite->first; auto pend = ite->second; ranges.pop_front(); - LOG_PREFIX(SeaStore::list_objects); - DEBUGT("pstart {}, pend {}, limit {}", t, pstart, pend, limit); + DEBUGT("pstart {}, pend {}, limit {} ...", t, pstart, pend, limit); return onode_manager->list_onodes( t, pstart, pend, limit - ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end] + ).si_then([&limit, &ret, pend, &t, last=ranges.empty(), end, FNAME] (auto &&_ret) mutable { auto &next_objects = std::get<0>(_ret); auto &ret_objects = std::get<0>(ret); @@ -901,7 +964,6 @@ SeaStore::Shard::list_objects(CollectionRef ch, std::get<1>(ret) = std::get<1>(_ret); assert(limit >= next_objects.size()); limit -= next_objects.size(); - LOG_PREFIX(SeaStore::list_objects); DEBUGT("got {} objects, left limit {}", t, next_objects.size(), limit); assert(limit == 0 || @@ -914,10 +976,13 @@ SeaStore::Shard::list_objects(CollectionRef ch, seastar::stop_iteration >(seastar::stop_iteration::no); }); - }).si_then([&ret] { - return list_iertr::make_ready_future< - OnodeManager::list_onodes_bare_ret>(std::move(ret)); - }); + } + ).si_then([&ret, FNAME] { + DEBUG("got {} objects, next={}", + std::get<0>(ret).size(), std::get<1>(ret)); + return list_iertr::make_ready_future< + OnodeManager::list_onodes_bare_ret>(std::move(ret)); + }); } }); }).safe_then([&ret](auto&& _ret) { @@ -927,7 +992,7 @@ SeaStore::Shard::list_objects(CollectionRef ch, return std::move(ret); }).handle_error( 
crimson::ct_error::assert_all{ - "Invalid error in SeaStore::list_objects" + "Invalid error in SeaStoreS::list_objects" } ); }).finally([this] { @@ -939,23 +1004,26 @@ SeaStore::Shard::list_objects(CollectionRef ch, seastar::future<CollectionRef> SeaStore::Shard::create_new_collection(const coll_t& cid) { - LOG_PREFIX(SeaStore::create_new_collection); - DEBUG("{}", cid); + LOG_PREFIX(SeaStoreS::create_new_collection); + DEBUG("cid={}", cid); return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); } seastar::future<CollectionRef> SeaStore::Shard::open_collection(const coll_t& cid) { - LOG_PREFIX(SeaStore::open_collection); - DEBUG("{}", cid); - return list_collections().then([cid, this] (auto colls_cores) { + LOG_PREFIX(SeaStoreS::open_collection); + DEBUG("cid={} ...", cid); + return list_collections( + ).then([cid, this, FNAME] (auto colls_cores) { if (auto found = std::find(colls_cores.begin(), colls_cores.end(), std::make_pair(cid, seastar::this_shard_id())); found != colls_cores.end()) { + DEBUG("cid={} exists", cid); return seastar::make_ready_future<CollectionRef>(_get_collection(cid)); } else { + DEBUG("cid={} not exists", cid); return seastar::make_ready_future<CollectionRef>(); } }); @@ -965,6 +1033,8 @@ seastar::future<> SeaStore::Shard::set_collection_opts(CollectionRef c, const pool_opts_t& opts) { + LOG_PREFIX(SeaStoreS::set_collection_opts); + DEBUG("cid={}, opts={} not implemented", c->get_cid(), opts); //TODO return seastar::now(); } @@ -986,6 +1056,8 @@ SeaStore::Shard::list_collections() "list_collections", [this, &ret](auto& t) { + LOG_PREFIX(SeaStoreS::list_collections); + DEBUGT("...", t); return transaction_manager->read_collection_root(t ).si_then([this, &t](auto coll_root) { return collection_manager->list(coll_root, t); @@ -1004,7 +1076,7 @@ SeaStore::Shard::list_collections() } ).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::list_collections" + "Invalid error in SeaStoreS::list_collections" } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1012,6 +1084,42 @@ SeaStore::Shard::list_collections() }); } +SeaStore::base_iertr::future<ceph::bufferlist> +SeaStore::Shard::_read( + Transaction& t, + Onode& onode, + uint64_t offset, + std::size_t len, + uint32_t op_flags) +{ + LOG_PREFIX(SeaStoreS::_read); + size_t size = onode.get_layout().size; + if (offset >= size) { + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x}, got none", + t, offset, len, size, op_flags); + return seastar::make_ready_future<ceph::bufferlist>(); + } + + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} flags=0x{:x} ...", + t, offset, len, size, op_flags); + size_t corrected_len = (len == 0) ? 
+ size - offset : + std::min(size - offset, len); + + return ObjectDataHandler(max_object_size).read( + ObjectDataHandler::context_t{ + *transaction_manager, + t, + onode, + }, + offset, + corrected_len + ).si_then([FNAME, &t](auto bl) { + DEBUGT("got bl length=0x{:x}", t, bl.length()); + return bl; + }); +} + SeaStore::Shard::read_errorator::future<ceph::bufferlist> SeaStore::Shard::read( CollectionRef ch, @@ -1020,9 +1128,6 @@ SeaStore::Shard::read( size_t len, uint32_t op_flags) { - LOG_PREFIX(SeaStore::read); - DEBUG("oid {} offset {} len {}", oid, offset, len); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1030,29 +1135,11 @@ SeaStore::Shard::read( ch, oid, Transaction::src_t::READ, - "read_obj", + "read", op_type_t::READ, - [=, this](auto &t, auto &onode) -> ObjectDataHandler::read_ret { - size_t size = onode.get_layout().size; - - if (offset >= size) { - return seastar::make_ready_future<ceph::bufferlist>(); - } - - size_t corrected_len = (len == 0) ? - size - offset : - std::min(size - offset, len); - - return ObjectDataHandler(max_object_size).read( - ObjectDataHandler::context_t{ - *transaction_manager, - t, - onode, - }, - offset, - corrected_len); - } - ).finally([this] { + [this, offset, len, op_flags](auto &t, auto &onode) { + return _read(t, onode, offset, len, op_flags); + }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); }); @@ -1063,9 +1150,7 @@ SeaStore::Shard::exists( CollectionRef c, const ghobject_t& oid) { - LOG_PREFIX(SeaStore::exists); - DEBUG("oid {}", oid); - + LOG_PREFIX(SeaStoreS::exists); ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1073,12 +1158,14 @@ SeaStore::Shard::exists( c, oid, Transaction::src_t::READ, - "oid_exists", + "exists", op_type_t::READ, - [](auto&, auto&) { + [FNAME](auto& t, auto&) { + DEBUGT("exists", t); return seastar::make_ready_future<bool>(true); }).handle_error( - crimson::ct_error::enoent::handle([] { + crimson::ct_error::enoent::handle([FNAME] { + DEBUG("not exists"); return seastar::make_ready_future<bool>(false); }), crimson::ct_error::assert_all{"unexpected error"} @@ -1095,66 +1182,78 @@ SeaStore::Shard::readv( interval_set<uint64_t>& m, uint32_t op_flags) { + LOG_PREFIX(SeaStoreS::readv); + DEBUG("cid={} oid={} op_flags=0x{:x} {} intervals", + ch->get_cid(), _oid, op_flags, m.num_intervals()); + return seastar::do_with( _oid, ceph::bufferlist{}, - [=, this, &m](auto &oid, auto &ret) { + [ch, op_flags, this, FNAME, &m](auto &oid, auto &ret) { return crimson::do_for_each( m, - [=, this, &oid, &ret](auto &p) { + [ch, op_flags, this, &oid, &ret](auto &p) { return read( ch, oid, p.first, p.second, op_flags ).safe_then([&ret](auto bl) { ret.claim_append(bl); }); - }).safe_then([&ret] { + }).safe_then([&ret, FNAME] { + DEBUG("got bl length=0x{:x}", ret.length()); return read_errorator::make_ready_future<ceph::bufferlist> (std::move(ret)); }); }); - return read_errorator::make_ready_future<ceph::bufferlist>(); } using crimson::os::seastore::omap_manager::BtreeOMapManager; +SeaStore::Shard::_omap_get_value_ret +SeaStore::Shard::_get_attr( + Transaction& t, + Onode& onode, + std::string_view name) const +{ + LOG_PREFIX(SeaStoreS::_get_attr); + auto& layout = onode.get_layout(); + if (name == OI_ATTR && layout.oi_size) { + ceph::bufferlist bl; + bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + DEBUGT("got OI_ATTR, value length=0x{:x}", t, bl.length()); + return seastar::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + if (name == SS_ATTR 
&& layout.ss_size) { + ceph::bufferlist bl; + bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); + DEBUGT("got SS_ATTR, value length=0x{:x}", t, bl.length()); + return seastar::make_ready_future<ceph::bufferlist>(std::move(bl)); + } + DEBUGT("name={} ...", t, name); + return _omap_get_value( + t, + layout.xattr_root.get( + onode.get_metadata_hint(device->get_block_size())), + name); +} + SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist> SeaStore::Shard::get_attr( CollectionRef ch, const ghobject_t& oid, std::string_view name) const { - auto c = static_cast<SeastoreCollection*>(ch.get()); - LOG_PREFIX(SeaStore::get_attr); - DEBUG("{} {}", c->get_cid(), oid); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); return repeat_with_onode<ceph::bufferlist>( - c, + ch, oid, Transaction::src_t::READ, "get_attr", op_type_t::GET_ATTR, - [=, this](auto &t, auto& onode) -> _omap_get_value_ret { - auto& layout = onode.get_layout(); - if (name == OI_ATTR && layout.oi_size) { - ceph::bufferlist bl; - bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); - return seastar::make_ready_future<ceph::bufferlist>(std::move(bl)); - } - if (name == SS_ATTR && layout.ss_size) { - ceph::bufferlist bl; - bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); - return seastar::make_ready_future<ceph::bufferlist>(std::move(bl)); - } - return _omap_get_value( - t, - layout.xattr_root.get( - onode.get_metadata_hint(device->get_block_size())), - name); - } - ).handle_error( + [this, name](auto &t, auto& onode) { + return _get_attr(t, onode, name); + }).handle_error( crimson::ct_error::input_output_error::assert_failure{ "EIO when getting attrs"}, crimson::ct_error::pass_further_all{} @@ -1164,48 +1263,53 @@ SeaStore::Shard::get_attr( }); } +SeaStore::base_iertr::future<SeaStore::Shard::attrs_t> +SeaStore::Shard::_get_attrs( + Transaction& t, + Onode& onode) +{ + LOG_PREFIX(SeaStoreS::_get_attrs); + DEBUGT("...", t); + auto& layout = onode.get_layout(); + return omap_list(onode, layout.xattr_root, t, std::nullopt, + OMapManager::omap_list_config_t() + .with_inclusive(false, false) + .without_max() + ).si_then([&layout, &t, FNAME](auto p) { + auto& attrs = std::get<1>(p); + DEBUGT("got {} attrs, OI length=0x{:x}, SS length=0x{:x}", + t, attrs.size(), (uint32_t)layout.oi_size, (uint32_t)layout.ss_size); + ceph::bufferlist bl; + if (layout.oi_size) { + bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); + attrs.emplace(OI_ATTR, std::move(bl)); + } + if (layout.ss_size) { + bl.clear(); + bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); + attrs.emplace(SS_ATTR, std::move(bl)); + } + return seastar::make_ready_future<attrs_t>(std::move(attrs)); + }); +} + SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t> SeaStore::Shard::get_attrs( CollectionRef ch, const ghobject_t& oid) { - LOG_PREFIX(SeaStore::get_attrs); - auto c = static_cast<SeastoreCollection*>(ch.get()); - DEBUG("{} {}", c->get_cid(), oid); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); return repeat_with_onode<attrs_t>( - c, + ch, oid, Transaction::src_t::READ, - "get_addrs", + "get_attrs", op_type_t::GET_ATTRS, - [=, this](auto &t, auto& onode) { - auto& layout = onode.get_layout(); - return omap_list(onode, layout.xattr_root, t, std::nullopt, - OMapManager::omap_list_config_t() - .with_inclusive(false, false) - .without_max() - ).si_then([&layout, &t, FNAME](auto p) { - auto& attrs = std::get<1>(p); - ceph::bufferlist bl; - if (layout.oi_size) { - 
bl.append(ceph::bufferptr(&layout.oi[0], layout.oi_size)); - attrs.emplace(OI_ATTR, std::move(bl)); - DEBUGT("set oi from onode layout", t); - } - if (layout.ss_size) { - bl.clear(); - bl.append(ceph::bufferptr(&layout.ss[0], layout.ss_size)); - attrs.emplace(SS_ATTR, std::move(bl)); - DEBUGT("set ss from onode layout", t); - } - return seastar::make_ready_future<omap_values_t>(std::move(attrs)); - }); - } - ).handle_error( + [this](auto &t, auto& onode) { + return _get_attrs(t, onode); + }).handle_error( crimson::ct_error::input_output_error::assert_failure{ "EIO when getting attrs"}, crimson::ct_error::pass_further_all{} @@ -1215,6 +1319,23 @@ SeaStore::Shard::get_attrs( }); } +seastar::future<struct stat> SeaStore::Shard::_stat( + Transaction& t, + Onode& onode, + const ghobject_t& oid) +{ + LOG_PREFIX(SeaStoreS::_stat); + struct stat st; + auto &olayout = onode.get_layout(); + st.st_size = olayout.size; + st.st_blksize = device->get_block_size(); + st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; + st.st_nlink = 1; + DEBUGT("oid={}, size={}, blksize={}", + t, oid, st.st_size, st.st_blksize); + return seastar::make_ready_future<struct stat>(st); +} + seastar::future<struct stat> SeaStore::Shard::stat( CollectionRef c, const ghobject_t& oid) @@ -1222,26 +1343,17 @@ seastar::future<struct stat> SeaStore::Shard::stat( ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - LOG_PREFIX(SeaStore::stat); return repeat_with_onode<struct stat>( c, oid, Transaction::src_t::READ, "stat", op_type_t::STAT, - [=, this](auto &t, auto &onode) { - struct stat st; - auto &olayout = onode.get_layout(); - st.st_size = olayout.size; - st.st_blksize = device->get_block_size(); - st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; - st.st_nlink = 1; - DEBUGT("cid {}, oid {}, return size {}", t, c->get_cid(), oid, st.st_size); - return seastar::make_ready_future<struct stat>(st); - } - ).handle_error( + [this, oid](auto &t, auto &onode) { + return _stat(t, onode, oid); + }).handle_error( crimson::ct_error::assert_all{ - "Invalid error in SeaStore::stat" + "Invalid error in SeaStoreS::stat" } ).finally([this] { assert(shard_stats.pending_read_num); @@ -1257,6 +1369,22 @@ SeaStore::Shard::omap_get_header( return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY); } +SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t> +SeaStore::Shard::do_omap_get_values( + Transaction& t, + Onode& onode, + const omap_keys_t& keys) +{ + LOG_PREFIX(SeaStoreS::do_omap_get_values); + DEBUGT("{} keys ...", t, keys.size()); + omap_root_t omap_root = onode.get_layout().omap_root.get( + onode.get_metadata_hint(device->get_block_size())); + return _omap_get_values( + t, + std::move(omap_root), + keys); +} + SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t> SeaStore::Shard::omap_get_values( CollectionRef ch, @@ -1266,22 +1394,15 @@ SeaStore::Shard::omap_get_values( ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - auto c = static_cast<SeastoreCollection*>(ch.get()); return repeat_with_onode<omap_values_t>( - c, + ch, oid, Transaction::src_t::READ, "omap_get_values", op_type_t::OMAP_GET_VALUES, [this, keys](auto &t, auto &onode) { - omap_root_t omap_root = onode.get_layout().omap_root.get( - onode.get_metadata_hint(device->get_block_size())); - return _omap_get_values( - t, - std::move(omap_root), - keys); - } - ).finally([this] { + return do_omap_get_values(t, onode, keys); + }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); }); 
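// ---------------------------------------------------------------------------
// Editor's note (not part of the patch): the hunks above repeatedly apply the
// same refactoring -- the public SeaStore::Shard entry points (read, get_attr,
// get_attrs, stat, omap_get_values) keep only the stats bookkeeping and the
// repeat_with_onode() call, while the actual work moves into private _read /
// _get_attr / _get_attrs / _stat / do_omap_get_values helpers that log under
// the new "SeaStoreS::" prefix. The short, self-contained sketch below mirrors
// only the shape of that split plus the range clamping done in _read()
// (len == 0 means "read to the end of the object"); all names here
// (SimpleStore, Object, ...) are illustrative assumptions, not the
// crimson/seastore APIs.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct Object {
  std::vector<char> data;
};

class SimpleStore {
public:
  // public entry point: bookkeeping only, then delegate to the helper
  std::string read(const Object& o, std::uint64_t offset, std::size_t len) {
    ++reads_;
    return _read(o, offset, len);
  }

private:
  // helper owns the bounds check and the length clamping
  std::string _read(const Object& o, std::uint64_t offset, std::size_t len) {
    const std::size_t size = o.data.size();
    if (offset >= size) {
      return {};  // reading past the end yields an empty buffer
    }
    const std::size_t corrected_len =
        (len == 0) ? size - offset : std::min(size - offset, len);
    return std::string(o.data.begin() + offset,
                       o.data.begin() + offset + corrected_len);
  }

  std::uint64_t reads_ = 0;
};

int main() {
  Object o{{'s', 'e', 'a', 's', 't', 'o', 'r', 'e'}};
  SimpleStore s;
  std::cout << s.read(o, 3, 0) << "\n";    // "store": len == 0 reads to the end
  std::cout << s.read(o, 3, 100) << "\n";  // "store": length clamped to size
  std::cout << s.read(o, 9, 4) << "\n";    // "": offset beyond object size
}
// ---------------------------------------------------------------------------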
@@ -1298,58 +1419,62 @@ SeaStore::Shard::_omap_get_value( std::move(root), std::string(key), [&t](auto &manager, auto& root, auto& key) -> _omap_get_value_ret { - if (root.is_null()) { + LOG_PREFIX(SeaStoreS::_omap_get_value); + if (root.is_null()) { + DEBUGT("key={} is absent because of null root", t, key); + return crimson::ct_error::enodata::make(); + } + return manager.omap_get_value(root, t, key + ).si_then([&key, &t, FNAME](auto opt) -> _omap_get_value_ret { + if (!opt) { + DEBUGT("key={} is absent", t, key); return crimson::ct_error::enodata::make(); } - return manager.omap_get_value(root, t, key - ).si_then([](auto opt) -> _omap_get_value_ret { - if (!opt) { - return crimson::ct_error::enodata::make(); - } - return seastar::make_ready_future<ceph::bufferlist>(std::move(*opt)); - }); - } - ); + DEBUGT("key={}, value length=0x{:x}", t, key, opt->length()); + return seastar::make_ready_future<ceph::bufferlist>(std::move(*opt)); + }); + }); } -SeaStore::Shard::_omap_get_values_ret +SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t> SeaStore::Shard::_omap_get_values( Transaction &t, omap_root_t &&omap_root, const omap_keys_t &keys) const { + LOG_PREFIX(SeaStoreS::_omap_get_values); if (omap_root.is_null()) { + DEBUGT("{} keys are absent because of null root", t, keys.size()); return seastar::make_ready_future<omap_values_t>(); } return seastar::do_with( BtreeOMapManager(*transaction_manager), std::move(omap_root), omap_values_t(), - [&](auto &manager, auto &root, auto &ret) { - return trans_intr::do_for_each( - keys.begin(), - keys.end(), - [&](auto &key) { - return manager.omap_get_value( - root, - t, - key - ).si_then([&ret, &key](auto &&p) { - if (p) { - bufferlist bl; - bl.append(*p); - ret.emplace( - std::move(key), - std::move(bl)); - } - return seastar::now(); - }); + [&t, &keys, FNAME](auto &manager, auto &root, auto &ret) { + return trans_intr::do_for_each( + keys.begin(), + keys.end(), + [&t, &manager, &root, &ret](auto &key) { + return manager.omap_get_value( + root, + t, + key + ).si_then([&ret, &key](auto &&p) { + if (p) { + bufferlist bl; + bl.append(*p); + ret.emplace( + std::move(key), + std::move(bl)); } - ).si_then([&ret] { - return std::move(ret); + return seastar::now(); }); - } - ); + }).si_then([&t, &ret, &keys, FNAME] { + DEBUGT("{} keys got {} values", t, keys.size(), ret.size()); + return std::move(ret); + }); + }); } SeaStore::Shard::omap_list_ret @@ -1377,51 +1502,74 @@ SeaStore::Shard::omap_list( }); } -SeaStore::Shard::omap_get_values_ret_t +SeaStore::base_iertr::future<SeaStore::Shard::omap_values_paged_t> +SeaStore::Shard::do_omap_get_values( + Transaction& t, + Onode& onode, + const std::optional<std::string>& start) +{ + LOG_PREFIX(SeaStoreS::do_omap_get_values); + DEBUGT("start={} ...", t, start.has_value() ? 
*start : ""); + return omap_list( + onode, + onode.get_layout().omap_root, + t, + start, + OMapManager::omap_list_config_t() + .with_inclusive(false, false) + .without_max() + ).si_then([FNAME, &t](omap_values_paged_t ret) { + DEBUGT("got {} values, complete={}", + t, std::get<1>(ret).size(), std::get<0>(ret)); + return ret; + }); +} + +SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_paged_t> SeaStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const std::optional<string> &start) + const std::optional<std::string> &start) { - auto c = static_cast<SeastoreCollection*>(ch.get()); - LOG_PREFIX(SeaStore::omap_get_values); - DEBUG("{} {}", c->get_cid(), oid); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - using ret_bare_t = std::tuple<bool, SeaStore::Shard::omap_values_t>; - return repeat_with_onode<ret_bare_t>( - c, + return repeat_with_onode<omap_values_paged_t>( + ch, oid, Transaction::src_t::READ, - "omap_list", - op_type_t::OMAP_LIST, + "omap_get_values2", + op_type_t::OMAP_GET_VALUES2, [this, start](auto &t, auto &onode) { - return omap_list( - onode, - onode.get_layout().omap_root, - t, - start, - OMapManager::omap_list_config_t() - .with_inclusive(false, false) - .without_max()); - } - ).finally([this] { + return do_omap_get_values(t, onode, start); + }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); }); } -SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap( +SeaStore::base_iertr::future<SeaStore::Shard::fiemap_ret_t> +SeaStore::Shard::_fiemap( Transaction &t, Onode &onode, uint64_t off, uint64_t len) const { + LOG_PREFIX(SeaStoreS::_fiemap); + size_t size = onode.get_layout().size; + if (off >= size) { + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x}, got none", + t, off, len, size); + return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(); + } + DEBUGT("0x{:x}~0x{:x} onode-size=0x{:x} ...", + t, off, len, size); + size_t adjust_len = (len == 0) ? + size - off: + std::min(size - off, len); return seastar::do_with( ObjectDataHandler(max_object_size), - [=, this, &t, &onode] (auto &objhandler) { + [this, off, adjust_len, &t, &onode](auto &objhandler) { return objhandler.fiemap( ObjectDataHandler::context_t{ *transaction_manager, @@ -1429,39 +1577,31 @@ SeaStore::Shard::_fiemap_ret SeaStore::Shard::_fiemap( onode, }, off, - len); + adjust_len); + }).si_then([FNAME, &t](auto ret) { + DEBUGT("got {} intervals", t, ret.size()); + return ret; }); } -SeaStore::Shard::read_errorator::future<std::map<uint64_t, uint64_t>> +SeaStore::Shard::read_errorator::future<SeaStore::Shard::fiemap_ret_t> SeaStore::Shard::fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, uint64_t len) { - LOG_PREFIX(SeaStore::fiemap); - DEBUG("oid: {}, off: {}, len: {} ", oid, off, len); - ++(shard_stats.read_num); ++(shard_stats.pending_read_num); - return repeat_with_onode<std::map<uint64_t, uint64_t>>( + return repeat_with_onode<fiemap_ret_t>( ch, oid, Transaction::src_t::READ, - "fiemap_read", + "fiemap", op_type_t::READ, - [=, this](auto &t, auto &onode) -> _fiemap_ret { - size_t size = onode.get_layout().size; - if (off >= size) { - INFOT("fiemap offset is over onode size!", t); - return seastar::make_ready_future<std::map<uint64_t, uint64_t>>(); - } - size_t adjust_len = (len == 0) ? 
- size - off: - std::min(size - off, len); - return _fiemap(t, onode, off, adjust_len); + [this, off, len](auto &t, auto &onode) { + return _fiemap(t, onode, off, len); }).finally([this] { assert(shard_stats.pending_read_num); --(shard_stats.pending_read_num); @@ -1469,7 +1609,7 @@ SeaStore::Shard::fiemap( } void SeaStore::Shard::on_error(ceph::os::Transaction &t) { - LOG_PREFIX(SeaStore::on_error); + LOG_PREFIX(SeaStoreS::on_error); ERROR(" transaction dump:\n"); JSONFormatter f(true); f.open_object_section("transaction"); @@ -1490,17 +1630,22 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( ++(shard_stats.starting_io_num); // repeat_with_internal_context ensures ordering via collection lock + auto num_bytes = _t.get_num_bytes(); return repeat_with_internal_context( _ch, std::move(_t), Transaction::src_t::MUTATE, "do_transaction", - op_type_t::TRANSACTION, - [this](auto &ctx) { - return with_trans_intr(*ctx.transaction, [&, this](auto &t) { - LOG_PREFIX(SeaStore::Shard::do_transaction_no_callbacks); - SUBDEBUGT(seastore_t, "start with {} objects", - t, ctx.iter.objects.size()); + op_type_t::DO_TRANSACTION, + [this, num_bytes](auto &ctx) { + LOG_PREFIX(SeaStoreS::do_transaction_no_callbacks); + return with_trans_intr(*ctx.transaction, [&ctx, this, FNAME, num_bytes](auto &t) { + DEBUGT("cid={}, {} operations, {} bytes, {} colls, {} objects ...", + t, ctx.ch->get_cid(), + ctx.ext_transaction.get_num_ops(), + num_bytes, + ctx.iter.colls.size(), + ctx.iter.objects.size()); #ifndef NDEBUG TRACET(" transaction dump:\n", t); JSONFormatter f(true); @@ -1534,6 +1679,8 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( }).si_then([this, &ctx] { return transaction_manager->submit_transaction(*ctx.transaction); }); + }).safe_then([FNAME, &ctx] { + DEBUGT("done", *ctx.transaction); }); } ).finally([this] { @@ -1573,27 +1720,31 @@ SeaStore::Shard::_do_transaction_step( std::vector<OnodeRef> &d_onodes, ceph::os::Transaction::iterator &i) { - LOG_PREFIX(SeaStore::Shard::_do_transaction_step); + LOG_PREFIX(SeaStoreS::_do_transaction_step); auto op = i.decode_op(); - SUBTRACET(seastore_t, "got op {}", *ctx.transaction, (uint32_t)op->op); using ceph::os::Transaction; - if (op->op == Transaction::OP_NOP) + if (op->op == Transaction::OP_NOP) { + DEBUGT("op NOP", *ctx.transaction); return tm_iertr::now(); + } switch (op->op) { case Transaction::OP_RMCOLL: { coll_t cid = i.get_cid(op->cid); + DEBUGT("op RMCOLL, cid={} ...", *ctx.transaction, cid); return _remove_collection(ctx, cid); } case Transaction::OP_MKCOLL: { coll_t cid = i.get_cid(op->cid); + DEBUGT("op MKCOLL, cid={} ...", *ctx.transaction, cid); return _create_collection(ctx, cid, op->split_bits); } case Transaction::OP_COLL_HINT: { + DEBUGT("op COLL_HINT", *ctx.transaction); ceph::bufferlist hint; i.decode_bl(hint); return tm_iertr::now(); @@ -1611,14 +1762,18 @@ SeaStore::Shard::_do_transaction_step( create = true; } if (!onodes[op->oid]) { + const ghobject_t& oid = i.get_oid(op->oid); if (!create) { - fut = onode_manager->get_onode(*ctx.transaction, i.get_oid(op->oid)); + DEBUGT("op {}, get oid={} ...", + *ctx.transaction, (uint32_t)op->op, oid); + fut = onode_manager->get_onode(*ctx.transaction, oid); } else { - fut = onode_manager->get_or_create_onode( - *ctx.transaction, i.get_oid(op->oid)); + DEBUGT("op {}, get_or_create oid={} ...", + *ctx.transaction, (uint32_t)op->op, oid); + fut = onode_manager->get_or_create_onode(*ctx.transaction, oid); } } - return fut.si_then([&, op](auto get_onode) { + return 
fut.si_then([&, op, this, FNAME](auto get_onode) { OnodeRef &o = onodes[op->oid]; if (!o) { assert(get_onode); @@ -1628,11 +1783,13 @@ SeaStore::Shard::_do_transaction_step( if ((op->op == Transaction::OP_CLONE || op->op == Transaction::OP_COLL_MOVE_RENAME) && !d_onodes[op->dest_oid]) { + const ghobject_t& dest_oid = i.get_oid(op->dest_oid); + DEBUGT("op {}, get_or_create dest oid={} ...", + *ctx.transaction, (uint32_t)op->op, dest_oid); //TODO: use when_all_succeed after making onode tree // support parallel extents loading - return onode_manager->get_or_create_onode( - *ctx.transaction, i.get_oid(op->dest_oid) - ).si_then([&, op](auto dest_onode) { + return onode_manager->get_or_create_onode(*ctx.transaction, dest_oid + ).si_then([&onodes, &d_onodes, op](auto dest_onode) { assert(dest_onode); auto &d_o = onodes[op->dest_oid]; assert(!d_o); @@ -1644,13 +1801,13 @@ SeaStore::Shard::_do_transaction_step( } else { return OnodeManager::get_or_create_onode_iertr::now(); } - }).si_then([&, op, this]() -> tm_ret { - LOG_PREFIX(SeaStore::_do_transaction_step); + }).si_then([&ctx, &i, &onodes, &d_onodes, op, this, FNAME]() -> tm_ret { + const ghobject_t& oid = i.get_oid(op->oid); try { switch (op->op) { case Transaction::OP_REMOVE: { - TRACET("removing {}", *ctx.transaction, i.get_oid(op->oid)); + DEBUGT("op REMOVE, oid={} ...", *ctx.transaction, oid); return _remove(ctx, onodes[op->oid] ).si_then([&onodes, &d_onodes, op] { onodes[op->oid].reset(); @@ -1660,6 +1817,7 @@ SeaStore::Shard::_do_transaction_step( case Transaction::OP_CREATE: case Transaction::OP_TOUCH: { + DEBUGT("op CREATE/TOUCH, oid={} ...", *ctx.transaction, oid); return _touch(ctx, onodes[op->oid]); } case Transaction::OP_WRITE: @@ -1669,6 +1827,8 @@ SeaStore::Shard::_do_transaction_step( uint32_t fadvise_flags = i.get_fadvise_flags(); ceph::bufferlist bl; i.decode_bl(bl); + DEBUGT("op WRITE, oid={}, 0x{:x}~0x{:x}, flags=0x{:x} ...", + *ctx.transaction, oid, off, len, fadvise_flags); return _write( ctx, onodes[op->oid], off, len, std::move(bl), fadvise_flags); @@ -1676,6 +1836,7 @@ SeaStore::Shard::_do_transaction_step( case Transaction::OP_TRUNCATE: { uint64_t off = op->off; + DEBUGT("op TRUNCATE, oid={}, 0x{:x} ...", *ctx.transaction, oid, off); return _truncate(ctx, onodes[op->oid], off); } case Transaction::OP_SETATTR: @@ -1684,80 +1845,96 @@ SeaStore::Shard::_do_transaction_step( std::map<std::string, bufferlist> to_set; ceph::bufferlist& bl = to_set[name]; i.decode_bl(bl); + DEBUGT("op SETATTR, oid={}, attr name={}, value length=0x{:x} ...", + *ctx.transaction, oid, name, bl.length()); return _setattrs(ctx, onodes[op->oid], std::move(to_set)); } case Transaction::OP_SETATTRS: { std::map<std::string, bufferlist> to_set; i.decode_attrset(to_set); + DEBUGT("op SETATTRS, oid={}, attrs size={} ...", + *ctx.transaction, oid, to_set.size()); return _setattrs(ctx, onodes[op->oid], std::move(to_set)); } case Transaction::OP_RMATTR: { std::string name = i.decode_string(); + DEBUGT("op RMATTR, oid={}, attr name={} ...", + *ctx.transaction, oid, name); return _rmattr(ctx, onodes[op->oid], name); } case Transaction::OP_RMATTRS: { + DEBUGT("op RMATTRS, oid={} ...", *ctx.transaction, oid); return _rmattrs(ctx, onodes[op->oid]); } case Transaction::OP_OMAP_SETKEYS: { std::map<std::string, ceph::bufferlist> aset; i.decode_attrset(aset); + DEBUGT("op OMAP_SETKEYS, oid={}, omap size={} ...", + *ctx.transaction, oid, aset.size()); return _omap_set_values(ctx, onodes[op->oid], std::move(aset)); } case Transaction::OP_OMAP_SETHEADER: { 
ceph::bufferlist bl; i.decode_bl(bl); + DEBUGT("op OMAP_SETHEADER, oid={}, length=0x{:x} ...", + *ctx.transaction, oid, bl.length()); return _omap_set_header(ctx, onodes[op->oid], std::move(bl)); } case Transaction::OP_OMAP_RMKEYS: { omap_keys_t keys; i.decode_keyset(keys); + DEBUGT("op OMAP_RMKEYS, oid={}, omap size={} ...", + *ctx.transaction, oid, keys.size()); return _omap_rmkeys(ctx, onodes[op->oid], std::move(keys)); } case Transaction::OP_OMAP_RMKEYRANGE: { - string first, last; + std::string first, last; first = i.decode_string(); last = i.decode_string(); + DEBUGT("op OMAP_RMKEYRANGE, oid={}, first={}, last={} ...", + *ctx.transaction, oid, first, last); return _omap_rmkeyrange( ctx, onodes[op->oid], std::move(first), std::move(last)); } case Transaction::OP_OMAP_CLEAR: { + DEBUGT("op OMAP_CLEAR, oid={} ...", *ctx.transaction, oid); return _omap_clear(ctx, onodes[op->oid]); } case Transaction::OP_ZERO: { objaddr_t off = op->off; extent_len_t len = op->len; + DEBUGT("op ZERO, oid={}, 0x{:x}~0x{:x} ...", + *ctx.transaction, oid, off, len); return _zero(ctx, onodes[op->oid], off, len); } case Transaction::OP_SETALLOCHINT: { + DEBUGT("op SETALLOCHINT, oid={}, not implemented", + *ctx.transaction, oid); // TODO return tm_iertr::now(); } case Transaction::OP_CLONE: { - TRACET("cloning {} to {}", - *ctx.transaction, - i.get_oid(op->oid), - i.get_oid(op->dest_oid)); + DEBUGT("op CLONE, oid={}, dest oid={} ...", + *ctx.transaction, oid, i.get_oid(op->dest_oid)); return _clone(ctx, onodes[op->oid], d_onodes[op->dest_oid]); } case Transaction::OP_COLL_MOVE_RENAME: { + DEBUGT("op COLL_MOVE_RENAME, oid={}, dest oid={} ...", + *ctx.transaction, oid, i.get_oid(op->dest_oid)); ceph_assert(op->cid == op->dest_cid); - TRACET("renaming {} to {}", - *ctx.transaction, - i.get_oid(op->oid), - i.get_oid(op->dest_oid)); return _rename( ctx, onodes[op->oid], d_onodes[op->dest_oid] ).si_then([&onodes, &d_onodes, op] { @@ -1793,7 +1970,7 @@ SeaStore::Shard::_do_transaction_step( return seastar::now(); }), crimson::ct_error::assert_all{ - "Invalid error in SeaStore::do_transaction_step" + "Invalid error in SeaStoreS::do_transaction_step" } ); } @@ -1829,7 +2006,7 @@ SeaStore::Shard::_rename( ).handle_error_interruptible( crimson::ct_error::input_output_error::pass_further(), crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_rename"} + "Invalid error in SeaStoreS::_rename"} ); } @@ -1850,7 +2027,7 @@ SeaStore::Shard::_remove_omaps( ).handle_error_interruptible( crimson::ct_error::input_output_error::pass_further(), crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_remove" + "Invalid error in SeaStoreS::_remove_omaps" } ); }); @@ -1863,8 +2040,6 @@ SeaStore::Shard::_remove( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_remove); - DEBUGT("onode={}", *ctx.transaction, *onode); return _remove_omaps( ctx, onode, @@ -1892,7 +2067,7 @@ SeaStore::Shard::_remove( }).handle_error_interruptible( crimson::ct_error::input_output_error::pass_further(), crimson::ct_error::assert_all( - "Invalid error in SeaStore::_remove" + "Invalid error in SeaStoreS::_remove" ) ); } @@ -1902,8 +2077,6 @@ SeaStore::Shard::_touch( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_touch); - DEBUGT("onode={}", *ctx.transaction, *onode); return tm_iertr::now(); } @@ -1915,8 +2088,6 @@ SeaStore::Shard::_write( ceph::bufferlist &&_bl, uint32_t fadvise_flags) { - LOG_PREFIX(SeaStore::_write); - DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len); const auto &object_size = 
onode->get_layout().size; if (offset + len > object_size) { onode->update_onode_size( @@ -2007,8 +2178,6 @@ SeaStore::Shard::_clone( OnodeRef &onode, OnodeRef &d_onode) { - LOG_PREFIX(SeaStore::_clone); - DEBUGT("onode={} d_onode={}", *ctx.transaction, *onode, *d_onode); return seastar::do_with( ObjectDataHandler(max_object_size), [this, &ctx, &onode, &d_onode](auto &objHandler) { @@ -2034,9 +2203,10 @@ SeaStore::Shard::_zero( objaddr_t offset, extent_len_t len) { - LOG_PREFIX(SeaStore::_zero); - DEBUGT("onode={} {}~{}", *ctx.transaction, *onode, offset, len); if (offset + len >= max_object_size) { + LOG_PREFIX(SeaStoreS::_zero); + ERRORT("0x{:x}~0x{:x} >= 0x{:x}", + *ctx.transaction, offset, len, max_object_size); return crimson::ct_error::input_output_error::make(); } const auto &object_size = onode->get_layout().size; @@ -2092,8 +2262,6 @@ SeaStore::Shard::_omap_set_values( OnodeRef &onode, std::map<std::string, ceph::bufferlist> &&aset) { - LOG_PREFIX(SeaStore::_omap_set_values); - DEBUGT("{} {} keys", *ctx.transaction, *onode, aset.size()); return _omap_set_kvs( onode, onode->get_layout().omap_root, @@ -2112,8 +2280,6 @@ SeaStore::Shard::_omap_set_header( OnodeRef &onode, ceph::bufferlist &&header) { - LOG_PREFIX(SeaStore::_omap_set_header); - DEBUGT("{} {} bytes", *ctx.transaction, *onode, header.length()); std::map<std::string, bufferlist> to_set; to_set[OMAP_HEADER_XATTR_KEY] = header; return _setattrs(ctx, onode,std::move(to_set)); @@ -2124,10 +2290,8 @@ SeaStore::Shard::_omap_clear( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_omap_clear); - DEBUGT("{} {} keys", *ctx.transaction, *onode); - return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY)) - .si_then([this, &ctx, &onode]() -> tm_ret { + return _xattr_rmattr(ctx, onode, std::string(OMAP_HEADER_XATTR_KEY) + ).si_then([this, &ctx, &onode]() -> tm_ret { if (auto omap_root = onode->get_layout().omap_root.get( onode->get_metadata_hint(device->get_block_size())); omap_root.is_null()) { @@ -2142,8 +2306,8 @@ SeaStore::Shard::_omap_clear( auto &omap_root) { return omap_manager.omap_clear( omap_root, - *ctx.transaction) - .si_then([&] { + *ctx.transaction + ).si_then([&] { if (omap_root.must_update()) { onode->update_omap_root(*ctx.transaction, omap_root); } @@ -2159,8 +2323,6 @@ SeaStore::Shard::_omap_rmkeys( OnodeRef &onode, omap_keys_t &&keys) { - LOG_PREFIX(SeaStore::_omap_rmkeys); - DEBUGT("{} {} keys", *ctx.transaction, *onode, keys.size()); auto omap_root = onode->get_layout().omap_root.get( onode->get_metadata_hint(device->get_block_size())); if (omap_root.is_null()) { @@ -2201,10 +2363,9 @@ SeaStore::Shard::_omap_rmkeyrange( std::string first, std::string last) { - LOG_PREFIX(SeaStore::_omap_rmkeyrange); - DEBUGT("{} first={} last={}", *ctx.transaction, *onode, first, last); if (first > last) { - ERRORT("range error, first: {} > last:{}", *ctx.transaction, first, last); + LOG_PREFIX(SeaStoreS::_omap_rmkeyrange); + ERRORT("range error, first:{} > last:{}", *ctx.transaction, first, last); ceph_abort(); } auto omap_root = onode->get_layout().omap_root.get( @@ -2247,8 +2408,6 @@ SeaStore::Shard::_truncate( OnodeRef &onode, uint64_t size) { - LOG_PREFIX(SeaStore::_truncate); - DEBUGT("onode={} size={}", *ctx.transaction, *onode, size); onode->update_onode_size(*ctx.transaction, size); return seastar::do_with( ObjectDataHandler(max_object_size), @@ -2269,9 +2428,7 @@ SeaStore::Shard::_setattrs( OnodeRef &onode, std::map<std::string, bufferlist>&& aset) { - LOG_PREFIX(SeaStore::_setattrs); - 
DEBUGT("onode={}", *ctx.transaction, *onode); - + LOG_PREFIX(SeaStoreS::_setattrs); auto fut = tm_iertr::now(); auto& layout = onode->get_layout(); if (auto it = aset.find(OI_ATTR); it != aset.end()) { @@ -2333,8 +2490,6 @@ SeaStore::Shard::_rmattr( OnodeRef &onode, std::string name) { - LOG_PREFIX(SeaStore::_rmattr); - DEBUGT("onode={}", *ctx.transaction, *onode); auto& layout = onode->get_layout(); if ((name == OI_ATTR) && (layout.oi_size > 0)) { onode->clear_object_info(*ctx.transaction); @@ -2356,7 +2511,7 @@ SeaStore::Shard::_xattr_rmattr( OnodeRef &onode, std::string &&name) { - LOG_PREFIX(SeaStore::_xattr_rmattr); + LOG_PREFIX(SeaStoreS::_xattr_rmattr); DEBUGT("onode={}", *ctx.transaction, *onode); auto xattr_root = onode->get_layout().xattr_root.get( onode->get_metadata_hint(device->get_block_size())); @@ -2384,8 +2539,6 @@ SeaStore::Shard::_rmattrs( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_rmattrs); - DEBUGT("onode={}", *ctx.transaction, *onode); onode->clear_object_info(*ctx.transaction); onode->clear_snapset(*ctx.transaction); return _xattr_clear(ctx, onode); @@ -2396,7 +2549,7 @@ SeaStore::Shard::_xattr_clear( internal_context_t &ctx, OnodeRef &onode) { - LOG_PREFIX(SeaStore::_xattr_clear); + LOG_PREFIX(SeaStoreS::_xattr_clear); DEBUGT("onode={}", *ctx.transaction, *onode); auto xattr_root = onode->get_layout().xattr_root.get( onode->get_metadata_hint(device->get_block_size())); @@ -2446,7 +2599,7 @@ SeaStore::Shard::_create_collection( }).handle_error_interruptible( tm_iertr::pass_further{}, crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_create_collection" + "Invalid error in SeaStoreS::_create_collection" } ); } @@ -2478,7 +2631,7 @@ SeaStore::Shard::_remove_collection( }).handle_error_interruptible( tm_iertr::pass_further{}, crimson::ct_error::assert_all{ - "Invalid error in SeaStore::_create_collection" + "Invalid error in SeaStoreS::_create_collection" } ); } @@ -2489,40 +2642,53 @@ SeaStore::Shard::_get_collection(const coll_t& cid) return new SeastoreCollection{cid}; } +seastar::future<> SeaStore::write_meta( + const std::string& key, + const std::string& value) { + LOG_PREFIX(SeaStore::write_meta); + DEBUG("key={} value={} ...", key, value); + + ceph_assert(seastar::this_shard_id() == primary_core); + return seastar::do_with(key, value, + [this, FNAME](auto& key, auto& value) { + return shard_stores.local().write_meta(key, value + ).then([this, &key, &value] { + return mdstore->write_meta(key, value); + }).safe_then([FNAME, &key, &value] { + DEBUG("key={} value={} done", key, value); + }).handle_error( + crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} + ); + }); +} + seastar::future<> SeaStore::Shard::write_meta( const std::string& key, const std::string& value) { - LOG_PREFIX(SeaStore::write_meta); - DEBUG("key: {}; value: {}", key, value); - ++(shard_stats.io_num); ++(shard_stats.pending_io_num); // For TM::submit_transaction() ++(shard_stats.processing_inlock_io_num); - return seastar::do_with( - key, value, - [this, FNAME](auto& key, auto& value) { - return repeat_eagain([this, FNAME, &key, &value] { - ++(shard_stats.repeat_io_num); - - return transaction_manager->with_transaction_intr( - Transaction::src_t::MUTATE, - "write_meta", - [this, FNAME, &key, &value](auto& t) - { - DEBUGT("Have transaction, key: {}; value: {}", t, key, value); - return transaction_manager->update_root_meta( - t, key, value - ).si_then([this, &t] { - return transaction_manager->submit_transaction(t); - }); - }); - }); - } - 
).handle_error( - crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} + return repeat_eagain([this, &key, &value] { + ++(shard_stats.repeat_io_num); + + return transaction_manager->with_transaction_intr( + Transaction::src_t::MUTATE, + "write_meta", + [this, &key, &value](auto& t) + { + LOG_PREFIX(SeaStoreS::write_meta); + DEBUGT("key={} value={} ...", t, key, value); + return transaction_manager->update_root_meta( + t, key, value + ).si_then([this, &t] { + return transaction_manager->submit_transaction(t); + }); + }); + }).handle_error( + crimson::ct_error::assert_all{"Invalid error in SeaStoreS::write_meta"} ).finally([this] { assert(shard_stats.pending_io_num); --(shard_stats.pending_io_num); @@ -2535,13 +2701,17 @@ seastar::future<> SeaStore::Shard::write_meta( seastar::future<std::tuple<int, std::string>> SeaStore::read_meta(const std::string& key) { - ceph_assert(seastar::this_shard_id() == primary_core); LOG_PREFIX(SeaStore::read_meta); - DEBUG("key: {}", key); - return mdstore->read_meta(key).safe_then([](auto v) { + DEBUG("key={} ...", key); + + ceph_assert(seastar::this_shard_id() == primary_core); + return mdstore->read_meta(key + ).safe_then([key, FNAME](auto v) { if (v) { + DEBUG("key={}, value={}", key, *v); return std::make_tuple(0, std::move(*v)); } else { + ERROR("key={} failed", key); return std::make_tuple(-1, std::string("")); } }).handle_error( @@ -2598,7 +2768,7 @@ shard_stats_t SeaStore::Shard::get_io_stats( ret.minus(last_shard_stats); if (report_detail && seconds != 0) { - LOG_PREFIX(SeaStore::get_io_stats); + LOG_PREFIX(SeaStoreS::get_io_stats); auto calc_conflicts = [](uint64_t ios, uint64_t repeats) { return (double)(repeats-ios)/ios; }; diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index fb495a422f6..185072744f2 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -35,14 +35,14 @@ using OnodeRef = boost::intrusive_ptr<Onode>; class TransactionManager; enum class op_type_t : uint8_t { - TRANSACTION = 0, + DO_TRANSACTION = 0, READ, WRITE, GET_ATTR, GET_ATTRS, STAT, OMAP_GET_VALUES, - OMAP_LIST, + OMAP_GET_VALUES2, MAX }; @@ -71,20 +71,19 @@ struct col_obj_ranges_t { class SeaStore final : public FuturizedStore { public: + using base_ertr = TransactionManager::base_ertr; + using base_iertr = TransactionManager::base_iertr; + class MDStore { public: - using base_iertr = crimson::errorator< - crimson::ct_error::input_output_error - >; - - using write_meta_ertr = base_iertr; + using write_meta_ertr = base_ertr; using write_meta_ret = write_meta_ertr::future<>; virtual write_meta_ret write_meta( const std::string &key, const std::string &val ) = 0; - using read_meta_ertr = base_iertr; + using read_meta_ertr = base_ertr; using read_meta_ret = write_meta_ertr::future<std::optional<std::string>>; virtual read_meta_ret read_meta(const std::string &key) = 0; @@ -136,10 +135,7 @@ public: const omap_keys_t& keys) final; /// Retrieves paged set of values > start (if present) - using omap_get_values_ret_bare_t = std::tuple<bool, omap_values_t>; - using omap_get_values_ret_t = read_errorator::future< - omap_get_values_ret_bare_t>; - omap_get_values_ret_t omap_get_values( + read_errorator::future<omap_values_paged_t> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid const std::optional<std::string> &start ///< [in] start, empty for begin @@ -170,7 +166,7 @@ public: * stages and locks as do_transaction. 
*/ seastar::future<> flush(CollectionRef ch) final; - read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( + read_errorator::future<fiemap_ret_t> fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, @@ -190,7 +186,6 @@ public: secondaries.emplace_back(&sec_dev); } - using coll_core_t = FuturizedStore::coll_core_t; seastar::future<std::vector<coll_core_t>> list_collections(); seastar::future<> write_meta(const std::string& key, @@ -305,18 +300,21 @@ public: auto begin_time = std::chrono::steady_clock::now(); return seastar::do_with( oid, Ret{}, std::forward<F>(f), - [this, src, op_type, begin_time, tname + [this, ch, src, op_type, begin_time, tname ](auto &oid, auto &ret, auto &f) { - return repeat_eagain([&, this, src, tname] { + return repeat_eagain([&, this, ch, src, tname] { assert(src == Transaction::src_t::READ); ++(shard_stats.repeat_read_num); return transaction_manager->with_transaction_intr( src, tname, - [&, this](auto& t) + [&, this, ch, tname](auto& t) { + LOG_PREFIX(SeaStoreS::repeat_with_onode); + SUBDEBUGT(seastore, "{} cid={} oid={} ...", + t, tname, ch->get_cid(), oid); return onode_manager->get_onode(t, oid ).si_then([&](auto onode) { return seastar::do_with(std::move(onode), [&](auto& onode) { @@ -334,14 +332,16 @@ public: }); } - using _fiemap_ret = ObjectDataHandler::fiemap_ret; - _fiemap_ret _fiemap( - Transaction &t, - Onode &onode, - uint64_t off, - uint64_t len) const; + using omap_list_bare_ret = OMapManager::omap_list_bare_ret; + using omap_list_ret = OMapManager::omap_list_ret; + omap_list_ret omap_list( + Onode& onode, + const omap_root_le_t& omap_root, + Transaction& t, + const std::optional<std::string>& start, + OMapManager::omap_list_config_t config) const; - using _omap_get_value_iertr = OMapManager::base_iertr::extend< + using _omap_get_value_iertr = base_iertr::extend< crimson::ct_error::enodata >; using _omap_get_value_ret = _omap_get_value_iertr::future<ceph::bufferlist>; @@ -350,25 +350,51 @@ public: omap_root_t &&root, std::string_view key) const; - using _omap_get_values_iertr = OMapManager::base_iertr; - using _omap_get_values_ret = _omap_get_values_iertr::future<omap_values_t>; - _omap_get_values_ret _omap_get_values( + base_iertr::future<omap_values_t> _omap_get_values( Transaction &t, omap_root_t &&root, const omap_keys_t &keys) const; friend class SeaStoreOmapIterator; - using omap_list_bare_ret = OMapManager::omap_list_bare_ret; - using omap_list_ret = OMapManager::omap_list_ret; - omap_list_ret omap_list( - Onode &onode, - const omap_root_le_t& omap_root, + base_iertr::future<ceph::bufferlist> _read( Transaction& t, - const std::optional<std::string>& start, - OMapManager::omap_list_config_t config) const; + Onode& onode, + uint64_t offset, + std::size_t len, + uint32_t op_flags); + + _omap_get_value_ret _get_attr( + Transaction& t, + Onode& onode, + std::string_view name) const; + + base_iertr::future<attrs_t> _get_attrs( + Transaction& t, + Onode& onode); + + seastar::future<struct stat> _stat( + Transaction& t, + Onode& onode, + const ghobject_t& oid); + + base_iertr::future<omap_values_t> do_omap_get_values( + Transaction& t, + Onode& onode, + const omap_keys_t& keys); - using tm_iertr = TransactionManager::base_iertr; + base_iertr::future<omap_values_paged_t> do_omap_get_values( + Transaction& t, + Onode& onode, + const std::optional<std::string>& start); + + base_iertr::future<fiemap_ret_t> _fiemap( + Transaction &t, + Onode &onode, + uint64_t off, + uint64_t len) const; + + using tm_iertr = base_iertr; using tm_ret = 
tm_iertr::future<>; tm_ret _do_transaction_step( internal_context_t &ctx, @@ -535,17 +561,7 @@ public: return shard_stores.local().get_fsid(); } - seastar::future<> write_meta( - const std::string& key, - const std::string& value) final { - ceph_assert(seastar::this_shard_id() == primary_core); - return shard_stores.local().write_meta( - key, value).then([this, key, value] { - return mdstore->write_meta(key, value); - }).handle_error( - crimson::ct_error::assert_all{"Invalid error in SeaStore::write_meta"} - ); - } + seastar::future<> write_meta(const std::string& key, const std::string& value) final; seastar::future<std::tuple<int, std::string>> read_meta(const std::string& key) final; diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index e1430b30019..f379dd0117c 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -54,7 +54,9 @@ std::ostream &operator<<(std::ostream &out, const device_id_printer_t &id) } else if (_id == DEVICE_ID_ROOT) { return out << "Dev(ROOT)"; } else { - return out << "Dev(" << (unsigned)_id << ")"; + return out << "Dev(0x" + << std::hex << (unsigned)_id << std::dec + << ")"; } } @@ -64,7 +66,7 @@ std::ostream &operator<<(std::ostream &out, const segment_id_t &segment) return out << "Seg[NULL]"; } else { return out << "Seg[" << device_id_printer_t{segment.device_id()} - << "," << segment.device_segment_id() + << ",0x" << std::hex << segment.device_segment_id() << std::dec << "]"; } } @@ -93,12 +95,12 @@ std::ostream& operator<<(std::ostream& out, segment_seq_printer_t seq) } std::ostream &operator<<(std::ostream &out, const laddr_t &laddr) { - return out << 'L' << std::hex << laddr.value << std::dec; + return out << "L0x" << std::hex << laddr.value << std::dec; } std::ostream &operator<<(std::ostream &out, const laddr_offset_t &laddr_offset) { return out << laddr_offset.get_aligned_laddr() - << "+" << std::hex << laddr_offset.get_offset() << std::dec; + << "+0x" << std::hex << laddr_offset.get_offset() << std::dec; } std::ostream &operator<<(std::ostream &out, const pladdr_t &pladdr) @@ -123,18 +125,18 @@ std::ostream &operator<<(std::ostream &out, const paddr_t &rhs) } else if (has_device_off(id)) { auto &s = rhs.as_res_paddr(); out << device_id_printer_t{id} - << "," - << s.get_device_off(); + << ",0x" + << std::hex << s.get_device_off() << std::dec; } else if (rhs.get_addr_type() == paddr_types_t::SEGMENT) { auto &s = rhs.as_seg_paddr(); out << s.get_segment_id() - << "," - << s.get_segment_off(); + << ",0x" + << std::hex << s.get_segment_off() << std::dec; } else if (rhs.get_addr_type() == paddr_types_t::RANDOM_BLOCK) { auto &s = rhs.as_blk_paddr(); out << device_id_printer_t{s.get_device_id()} - << "," - << s.get_device_off(); + << ",0x" + << std::hex << s.get_device_off() << std::dec; } else { out << "INVALID!"; } diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 52515937a9e..5d8ad00ba22 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -80,6 +80,11 @@ struct rewrite_stats_t { } }; +struct rbm_pending_ool_t { + bool is_conflicted = false; + std::list<CachedExtentRef> pending_extents; +}; + /** * Transaction * @@ -554,6 +559,18 @@ public: return static_cast<T&>(*view); } + void set_pending_ool(seastar::lw_shared_ptr<rbm_pending_ool_t> ptr) { + pending_ool = ptr; + } + + seastar::lw_shared_ptr<rbm_pending_ool_t> get_pending_ool() { + return pending_ool; + } + + const 
auto& get_pre_alloc_list() { + return pre_alloc_list; + } + private: friend class Cache; friend Ref make_test_transaction(); @@ -650,6 +667,8 @@ private: const src_t src; transaction_id_t trans_id = TRANS_ID_NULL; + + seastar::lw_shared_ptr<rbm_pending_ool_t> pending_ool; }; using TransactionRef = Transaction::Ref; diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index a76b7fbe0c9..f4e3b0858f2 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -48,7 +48,7 @@ TransactionManager::TransactionManager( TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() { LOG_PREFIX(TransactionManager::mkfs); - INFO("enter"); + INFO("..."); return epm->mount( ).safe_then([this] { return journal->open_for_mkfs(); @@ -94,14 +94,15 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() }).safe_then([this] { return close(); }).safe_then([FNAME] { - INFO("completed"); + INFO("done"); }); } -TransactionManager::mount_ertr::future<> TransactionManager::mount() +TransactionManager::mount_ertr::future<> +TransactionManager::mount() { LOG_PREFIX(TransactionManager::mount); - INFO("enter"); + INFO("..."); cache->init(); return epm->mount( ).safe_then([this] { @@ -168,16 +169,17 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() return epm->open_for_write(); }).safe_then([FNAME, this] { epm->start_background(); - INFO("completed"); + INFO("done"); }).handle_error( mount_ertr::pass_further{}, crimson::ct_error::assert_all{"unhandled error"} ); } -TransactionManager::close_ertr::future<> TransactionManager::close() { +TransactionManager::close_ertr::future<> +TransactionManager::close() { LOG_PREFIX(TransactionManager::close); - INFO("enter"); + INFO("..."); return epm->stop_background( ).then([this] { return cache->close(); @@ -187,7 +189,7 @@ TransactionManager::close_ertr::future<> TransactionManager::close() { }).safe_then([this] { return epm->close(); }).safe_then([FNAME] { - INFO("completed"); + INFO("done"); return seastar::now(); }); } @@ -229,28 +231,26 @@ TransactionManager::ref_ret TransactionManager::remove( LogicalCachedExtentRef &ref) { LOG_PREFIX(TransactionManager::remove); - TRACET("{}", t, *ref); + DEBUGT("{} ...", t, *ref); return lba_manager->decref_extent(t, ref->get_laddr() ).si_then([this, FNAME, &t, ref](auto result) { - DEBUGT("extent refcount is decremented to {} -- {}", - t, result.refcount, *ref); if (result.refcount == 0) { cache->retire_extent(t, ref); } + DEBUGT("removed {}~0x{:x} refcount={} -- {}", + t, result.addr, result.length, result.refcount, *ref); return result.refcount; }); } -TransactionManager::ref_ret TransactionManager::_dec_ref( +TransactionManager::ref_ret TransactionManager::remove( Transaction &t, laddr_t offset) { - LOG_PREFIX(TransactionManager::_dec_ref); - TRACET("{}", t, offset); + LOG_PREFIX(TransactionManager::remove); + DEBUGT("{} ...", t, offset); return lba_manager->decref_extent(t, offset ).si_then([this, FNAME, offset, &t](auto result) -> ref_ret { - DEBUGT("extent refcount is decremented to {} -- {}~{}, {}", - t, result.refcount, offset, result.length, result.addr); auto fut = ref_iertr::now(); if (result.refcount == 0) { if (result.addr.is_paddr() && @@ -259,8 +259,9 @@ TransactionManager::ref_ret TransactionManager::_dec_ref( t, result.addr.get_paddr(), result.length); } } - - return fut.si_then([result=std::move(result)] { + return fut.si_then([result=std::move(result), offset, &t, FNAME] 
{ + DEBUGT("removed {}~0x{:x} refcount={} -- offset={}", + t, result.addr, result.length, result.refcount, offset); return result.refcount; }); }); @@ -271,19 +272,21 @@ TransactionManager::refs_ret TransactionManager::remove( std::vector<laddr_t> offsets) { LOG_PREFIX(TransactionManager::remove); - DEBUG("{} offsets", offsets.size()); + DEBUGT("{} offsets ...", t, offsets.size()); return seastar::do_with(std::move(offsets), std::vector<unsigned>(), - [this, &t] (auto &&offsets, auto &refcnt) { - return trans_intr::do_for_each(offsets.begin(), offsets.end(), - [this, &t, &refcnt] (auto &laddr) { - return this->remove(t, laddr).si_then([&refcnt] (auto ref) { - refcnt.push_back(ref); - return ref_iertr::now(); - }); - }).si_then([&refcnt] { - return ref_iertr::make_ready_future<std::vector<unsigned>>(std::move(refcnt)); + [this, &t, FNAME](auto &&offsets, auto &refcnts) { + return trans_intr::do_for_each(offsets.begin(), offsets.end(), + [this, &t, &refcnts](auto &laddr) { + return this->remove(t, laddr + ).si_then([&refcnts](auto ref) { + refcnts.push_back(ref); + return ref_iertr::now(); }); + }).si_then([&refcnts, &t, FNAME] { + DEBUGT("removed {} offsets", t, refcnts.size()); + return ref_iertr::make_ready_future<std::vector<unsigned>>(std::move(refcnts)); }); + }); } TransactionManager::submit_transaction_iertr::future<> @@ -340,6 +343,7 @@ TransactionManager::update_lba_mappings( return; } if (extent->is_logical()) { + assert(is_logical_type(extent->get_type())); // for rewritten extents, last_committed_crc should have been set // because the crc of the original extent may be reused. // also see rewrite_logical_extent() @@ -359,6 +363,7 @@ TransactionManager::update_lba_mappings( #endif lextents.emplace_back(extent->template cast<LogicalCachedExtent>()); } else { + assert(is_physical_type(extent->get_type())); pextents.emplace_back(extent); } }; @@ -515,7 +520,6 @@ TransactionManager::rewrite_logical_extent( ERRORT("extent has been invalidated -- {}", t, *extent); ceph_abort(); } - TRACET("rewriting extent -- {}", t, *extent); auto lextent = extent->cast<LogicalCachedExtent>(); cache->retire_extent(t, extent); @@ -529,7 +533,7 @@ TransactionManager::rewrite_logical_extent( lextent->get_rewrite_generation())->cast<LogicalCachedExtent>(); nlextent->rewrite(t, *lextent, 0); - DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent); + DEBUGT("rewriting meta -- {} to {}", t, *lextent, *nlextent); #ifndef NDEBUG if (get_checksum_needed(lextent->get_paddr())) { @@ -566,16 +570,16 @@ TransactionManager::rewrite_logical_extent( 0, lextent->get_length(), extent_ref_count_t(0), - [this, lextent, &t](auto &extents, auto &off, auto &left, auto &refcount) { + [this, FNAME, lextent, &t] + (auto &extents, auto &off, auto &left, auto &refcount) { return trans_intr::do_for_each( extents, - [lextent, this, &t, &off, &left, &refcount](auto &nextent) { - LOG_PREFIX(TransactionManager::rewrite_logical_extent); + [lextent, this, FNAME, &t, &off, &left, &refcount](auto &nextent) { bool first_extent = (off == 0); ceph_assert(left >= nextent->get_length()); auto nlextent = nextent->template cast<LogicalCachedExtent>(); nlextent->rewrite(t, *lextent, off); - DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent); + DEBUGT("rewriting data -- {} to {}", t, *lextent, *nlextent); /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc * extents since we're going to do it again once we either do the ool write @@ -629,10 +633,18 @@ 
TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( { auto updated = cache->update_extent_from_transaction(t, extent); if (!updated) { - DEBUGT("extent is already retired, skipping -- {}", t, *extent); + DEBUGT("target={} {} already retired, skipping -- {}", t, + rewrite_gen_printer_t{target_generation}, + sea_time_point_printer_t{modify_time}, + *extent); return rewrite_extent_iertr::now(); } + extent = updated; + DEBUGT("target={} {} -- {} ...", t, + rewrite_gen_printer_t{target_generation}, + sea_time_point_printer_t{modify_time}, + *extent); ceph_assert(!extent->is_pending_io()); } @@ -650,9 +662,9 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( // FIXME: is_dirty() is true for mutation pending extents // which shouldn't do inplace rewrite because a pending transaction // may fail. - DEBUGT("delta overwriting extent -- {}", t, *extent); t.add_inplace_rewrite_extent(extent); extent->set_inplace_rewrite_generation(); + DEBUGT("rewritten as inplace rewrite -- {}", t, *extent); return rewrite_extent_iertr::now(); } extent->set_target_rewrite_generation(INIT_GENERATION); @@ -665,23 +677,25 @@ TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( t.get_rewrite_stats().account_n_dirty(); } - if (is_backref_node(extent->get_type())) { - DEBUGT("rewriting backref extent -- {}", t, *extent); - return backref_manager->rewrite_extent(t, extent); - } - if (is_root_type(extent->get_type())) { - DEBUGT("rewriting root extent -- {}", t, *extent); cache->duplicate_for_write(t, extent); + DEBUGT("rewritten root {}", t, *extent); return rewrite_extent_iertr::now(); } + auto fut = rewrite_extent_iertr::now(); if (extent->is_logical()) { - return rewrite_logical_extent(t, extent->cast<LogicalCachedExtent>()); + assert(is_logical_type(extent->get_type())); + fut = rewrite_logical_extent(t, extent->cast<LogicalCachedExtent>()); + } else if (is_backref_node(extent->get_type())) { + fut = backref_manager->rewrite_extent(t, extent); } else { - DEBUGT("rewriting physical extent -- {}", t, *extent); - return lba_manager->rewrite_extent(t, extent); + assert(is_lba_node(extent->get_type())); + fut = lba_manager->rewrite_extent(t, extent); } + return fut.si_then([FNAME, &t] { + DEBUGT("rewritten", t); + }); } TransactionManager::get_extents_if_live_ret @@ -693,7 +707,7 @@ TransactionManager::get_extents_if_live( extent_len_t len) { LOG_PREFIX(TransactionManager::get_extents_if_live); - TRACET("{} {}~{} {}", t, type, laddr, len, paddr); + DEBUGT("{} {}~0x{:x} {} ...", t, type, laddr, len, paddr); // This only works with segments to check if alive, // as parallel transactions may split the extent at the same time. 
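// ---------------------------------------------------------------------------
// Editor's note (not part of the patch): the TransactionManager::remove()
// hunks above decrement the LBA mapping's refcount and, only when it reaches
// zero, retire the extent and release its backing space, now logging the
// resulting refcount either way. The tiny sketch below shows just that
// counting pattern over a plain std::map; MiniLBA and its members are
// illustrative assumptions, not the crimson LBAManager interface.
#include <cstdint>
#include <iostream>
#include <map>

struct Mapping {
  std::uint32_t refcount = 0;
  std::uint64_t length = 0;
};

class MiniLBA {
public:
  void add(std::uint64_t laddr, std::uint64_t len, std::uint32_t refs) {
    mappings_[laddr] = Mapping{refs, len};
  }

  // returns the refcount left after the decrement (0 means retired)
  std::uint32_t remove(std::uint64_t laddr) {
    auto it = mappings_.find(laddr);
    if (it == mappings_.end()) {
      return 0;  // nothing mapped at this address
    }
    std::uint32_t remaining = --it->second.refcount;
    if (remaining == 0) {
      mappings_.erase(it);  // retire the extent / release its allocation
    }
    std::cout << "removed laddr=0x" << std::hex << laddr << std::dec
              << " refcount=" << remaining << "\n";
    return remaining;
  }

private:
  std::map<std::uint64_t, Mapping> mappings_;
};

int main() {
  MiniLBA lba;
  lba.add(0x1000, 0x2000, 2);
  lba.remove(0x1000);  // refcount drops to 1, mapping kept
  lba.remove(0x1000);  // refcount drops to 0, mapping retired
}
// ---------------------------------------------------------------------------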
@@ -703,7 +717,7 @@ TransactionManager::get_extents_if_live( ).si_then([=, this, &t](auto extent) -> get_extents_if_live_ret { if (extent && extent->get_length() == len) { - DEBUGT("{} {}~{} {} is live in cache -- {}", + DEBUGT("{} {}~0x{:x} {} is cached and alive -- {}", t, type, laddr, len, paddr, *extent); std::list<CachedExtentRef> res; res.emplace_back(std::move(extent)); @@ -757,7 +771,9 @@ TransactionManager::get_extents_if_live( list.emplace_back(std::move(ret)); return seastar::now(); }); - }).si_then([&list] { + }).si_then([&list, &t, FNAME, type, laddr, len, paddr] { + DEBUGT("{} {}~0x{:x} {} is alive as {} extents", + t, type, laddr, len, paddr, list.size()); return get_extents_if_live_ret( interruptible::ready_future_marker{}, std::move(list)); @@ -778,11 +794,11 @@ TransactionManager::get_extents_if_live( ).si_then([=, &t](auto ret) { std::list<CachedExtentRef> res; if (ret) { - DEBUGT("{} {}~{} {} is live as physical extent -- {}", + DEBUGT("{} {}~0x{:x} {} is absent and alive as physical extent -- {}", t, type, laddr, len, paddr, *ret); res.emplace_back(std::move(ret)); } else { - DEBUGT("{} {}~{} {} is not live as physical extent", + DEBUGT("{} {}~0x{:x} {} is not alive as physical extent", t, type, laddr, len, paddr); } return get_extents_if_live_ret( diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 828b8a25592..c7a94a9ef11 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -106,8 +106,12 @@ public: Transaction &t, laddr_t offset) { LOG_PREFIX(TransactionManager::get_pin); - SUBTRACET(seastore_tm, "{}", t, offset); - return lba_manager->get_mapping(t, offset); + SUBDEBUGT(seastore_tm, "{} ...", t, offset); + return lba_manager->get_mapping(t, offset + ).si_then([FNAME, &t](LBAMappingRef pin) { + SUBDEBUGT(seastore_tm, "got {}", t, *pin); + return pin; + }); } /** @@ -122,9 +126,13 @@ public: laddr_t offset, extent_len_t length) { LOG_PREFIX(TransactionManager::get_pins); - SUBDEBUGT(seastore_tm, "{}~{}", t, offset, length); + SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, offset, length); return lba_manager->get_mappings( - t, offset, length); + t, offset, length + ).si_then([FNAME, &t](lba_pin_list_t pins) { + SUBDEBUGT(seastore_tm, "got {} pins", t, pins.size()); + return pins; + }); } /** @@ -142,15 +150,15 @@ public: laddr_t offset, extent_len_t length) { LOG_PREFIX(TransactionManager::read_extent); - SUBTRACET(seastore_tm, "{}~{}", t, offset, length); + SUBDEBUGT(seastore_tm, "{}~0x{:x} {} ...", + t, offset, length, T::TYPE); return get_pin( t, offset ).si_then([this, FNAME, &t, offset, length] (auto pin) -> read_extent_ret<T> { if (length != pin->get_length() || !pin->get_val().is_real()) { - SUBERRORT(seastore_tm, - "offset {} len {} got wrong pin {}", - t, offset, length, *pin); + SUBERRORT(seastore_tm, "{}~0x{:x} {} got wrong {}", + t, offset, length, T::TYPE, *pin); ceph_assert(0 == "Should be impossible"); } return this->read_pin<T>(t, std::move(pin)); @@ -167,15 +175,15 @@ public: Transaction &t, laddr_t offset) { LOG_PREFIX(TransactionManager::read_extent); - SUBTRACET(seastore_tm, "{}", t, offset); + SUBDEBUGT(seastore_tm, "{} {} ...", + t, offset, T::TYPE); return get_pin( t, offset ).si_then([this, FNAME, &t, offset] (auto pin) -> read_extent_ret<T> { if (!pin->get_val().is_real()) { - SUBERRORT(seastore_tm, - "offset {} got wrong pin {}", - t, offset, *pin); + SUBERRORT(seastore_tm, "{} {} got wrong {}", + t, offset, T::TYPE, *pin); 
ceph_assert(0 == "Should be impossible"); } return this->read_pin<T>(t, std::move(pin)); @@ -187,6 +195,8 @@ public: Transaction &t, LBAMappingRef pin) { + LOG_PREFIX(TransactionManager::read_pin); + SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin); auto fut = base_iertr::make_ready_future<LBAMappingRef>(); if (!pin->is_parent_viewable()) { if (pin->is_parent_valid()) { @@ -212,52 +222,12 @@ public: } else { return this->pin_to_extent<T>(t, std::move(std::get<0>(ret))); } + }).si_then([FNAME, &t](TCachedExtentRef<T> ext) { + SUBDEBUGT(seastore_tm, "got {}", t, *ext); + return ext; }); } - template <typename T> - std::variant<LBAMappingRef, base_iertr::future<TCachedExtentRef<T>>> - get_extent_if_linked( - Transaction &t, - LBAMappingRef pin) - { - ceph_assert(pin->is_parent_viewable()); - // checking the lba child must be atomic with creating - // and linking the absent child - auto v = pin->get_logical_extent(t); - if (v.has_child()) { - return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) { -#ifndef NDEBUG - auto lextent = extent->template cast<LogicalCachedExtent>(); - auto pin_laddr = pin->get_key(); - if (pin->is_indirect()) { - pin_laddr = pin->get_intermediate_base(); - } - assert(lextent->get_laddr() == pin_laddr); -#endif - return extent->template cast<T>(); - }); - } else { - return pin; - } - } - - base_iertr::future<LogicalCachedExtentRef> read_pin_by_type( - Transaction &t, - LBAMappingRef pin, - extent_types_t type) - { - ceph_assert(!pin->parent_modified()); - auto v = pin->get_logical_extent(t); - // checking the lba child must be atomic with creating - // and linking the absent child - if (v.has_child()) { - return std::move(v.get_child_fut()); - } else { - return pin_to_extent_by_type(t, std::move(pin), type); - } - } - /// Obtain mutable copy of extent LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) { LOG_PREFIX(TransactionManager::get_mutable_extent); @@ -265,24 +235,15 @@ public: t, ref)->cast<LogicalCachedExtent>(); if (!ret->has_laddr()) { - SUBDEBUGT(seastore_tm, - "duplicating extent for write -- {} -> {}", - t, - *ref, - *ret); + SUBDEBUGT(seastore_tm, "duplicate from {}", t, *ref); ret->set_laddr(ref->get_laddr()); } else { - SUBTRACET(seastore_tm, - "extent is already duplicated -- {}", - t, - *ref); assert(ref->is_mutable()); assert(&*ref == &*ret); } return ret; } - using ref_iertr = LBAManager::ref_iertr; using ref_ret = ref_iertr::future<extent_ref_count_t>; @@ -302,26 +263,15 @@ public: * remove * * Remove the extent and the corresponding lba mapping, - * users must make sure that lba mapping's refcount is 1 + * users must make sure that lba mapping's refcount > 1 */ ref_ret remove( Transaction &t, LogicalCachedExtentRef &ref); - /** - * remove - * - * 1. Remove the indirect mapping(s), and if refcount drops to 0, - * also remove the direct mapping and retire the extent. - * - * 2. Remove the direct mapping(s) and retire the extent if - * refcount drops to 0. 
- */ ref_ret remove( Transaction &t, - laddr_t offset) { - return _dec_ref(t, offset); - } + laddr_t offset); /// remove refcount for list of offset using refs_ret = ref_iertr::future<std::vector<unsigned>>; @@ -346,23 +296,23 @@ public: extent_len_t len, placement_hint_t placement_hint = placement_hint_t::HOT) { LOG_PREFIX(TransactionManager::alloc_non_data_extent); - SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}", - t, T::TYPE, len, placement_hint, laddr_hint); + SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...", + t, T::TYPE, laddr_hint, len, placement_hint); auto ext = cache->alloc_new_non_data_extent<T>( t, len, placement_hint, INIT_GENERATION); if (!ext) { + SUBERRORT(seastore_tm, "insufficient space!", t); return crimson::ct_error::enospc::make(); } return lba_manager->alloc_extent( t, laddr_hint, *ext - ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable { - LOG_PREFIX(TransactionManager::alloc_non_data_extent); - SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint); + ).si_then([ext=std::move(ext), &t, FNAME](auto &&) mutable { + SUBDEBUGT(seastore_tm, "allocated {}", t, *ext); return alloc_extent_iertr::make_ready_future<TCachedExtentRef<T>>( std::move(ext)); }); @@ -385,14 +335,15 @@ public: extent_len_t len, placement_hint_t placement_hint = placement_hint_t::HOT) { LOG_PREFIX(TransactionManager::alloc_data_extents); - SUBTRACET(seastore_tm, "{} len={}, placement_hint={}, laddr_hint={}", - t, T::TYPE, len, placement_hint, laddr_hint); + SUBDEBUGT(seastore_tm, "{} hint {}~0x{:x} phint={} ...", + t, T::TYPE, laddr_hint, len, placement_hint); auto exts = cache->alloc_new_data_extents<T>( t, len, placement_hint, INIT_GENERATION); if (exts.empty()) { + SUBERRORT(seastore_tm, "insufficient space!", t); return crimson::ct_error::enospc::make(); } return lba_manager->alloc_extents( @@ -403,7 +354,7 @@ public: EXTENT_DEFAULT_REF_COUNT ).si_then([exts=std::move(exts), &t, FNAME](auto &&) mutable { for (auto &ext : exts) { - SUBDEBUGT(seastore_tm, "new extent: {}", t, *ext); + SUBDEBUGT(seastore_tm, "allocated {}", t, *ext); } return alloc_extent_iertr::make_ready_future< std::vector<TCachedExtentRef<T>>>(std::move(exts)); @@ -411,15 +362,21 @@ public: } template <typename T> - read_extent_ret<T> get_mutable_extent_by_laddr(Transaction &t, laddr_t laddr, extent_len_t len) { + read_extent_ret<T> get_mutable_extent_by_laddr( + Transaction &t, + laddr_t laddr, + extent_len_t len) { + LOG_PREFIX(TransactionManager::get_mutable_extent_by_laddr); + SUBDEBUGT(seastore_tm, "{}~0x{:x} ...", t, laddr, len); return get_pin(t, laddr ).si_then([this, &t, len](auto pin) { ceph_assert(pin->is_data_stable() && !pin->is_zero_reserved()); ceph_assert(!pin->is_clone()); ceph_assert(pin->get_length() == len); return this->read_pin<T>(t, std::move(pin)); - }).si_then([this, &t](auto extent) { + }).si_then([this, &t, FNAME](auto extent) { auto ext = get_mutable_extent(t, extent)->template cast<T>(); + SUBDEBUGT(seastore_tm, "got mutable {}", t, *ext); return read_extent_iertr::make_ready_future<TCachedExtentRef<T>>( std::move(ext)); }); @@ -476,10 +433,8 @@ public: extent_len_t original_len = pin->get_length(); paddr_t original_paddr = pin->get_val(); LOG_PREFIX(TransactionManager::remap_pin); - SUBDEBUGT(seastore_tm, - "original laddr: {}, original paddr: {}, original length: {}," - " remap to {} extents", - t, original_laddr, original_paddr, original_len, remaps.size()); + SUBDEBUGT(seastore_tm, "{}~0x{:x} {} into {} remaps ... 
{}", + t, original_laddr, original_len, original_paddr, remaps.size(), *pin); // The according extent might be stable or pending. auto fut = base_iertr::now(); if (!pin->is_indirect()) { @@ -536,14 +491,13 @@ public: auto remap_len = remap.len; auto remap_laddr = (original_laddr + remap_offset).checked_to_laddr(); auto remap_paddr = original_paddr.add_offset(remap_offset); + SUBDEBUGT(seastore_tm, "remap direct pin into {}~0x{:x} {} ...", + t, remap_laddr, remap_len, remap_paddr); ceph_assert(remap_len < original_len); ceph_assert(remap_offset + remap_len <= original_len); ceph_assert(remap_len != 0); ceph_assert(remap_offset % cache->get_block_size() == 0); ceph_assert(remap_len % cache->get_block_size() == 0); - SUBDEBUGT(seastore_tm, - "remap laddr: {}, remap paddr: {}, remap length: {}", t, - remap_laddr, remap_paddr, remap_len); auto extent = cache->alloc_remapped_extent<T>( t, remap_laddr, @@ -555,13 +509,15 @@ public: } }); } - return fut.si_then([this, &t, &pin, &remaps, &extents] { + return fut.si_then([this, &t, &pin, &remaps, &extents, FNAME] { return lba_manager->remap_mappings( t, std::move(pin), std::vector<remap_entry>(remaps.begin(), remaps.end()), std::move(extents) - ).si_then([](auto ret) { + ).si_then([FNAME, &t](auto ret) { + SUBDEBUGT(seastore_tm, "remapped {} pins", + t, ret.remapped_mappings.size()); return Cache::retire_extent_iertr::make_ready_future< std::vector<LBAMappingRef>>(std::move(ret.remapped_mappings)); }); @@ -581,11 +537,15 @@ public: laddr_t hint, extent_len_t len) { LOG_PREFIX(TransactionManager::reserve_region); - SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}", t, len, hint); + SUBDEBUGT(seastore_tm, "hint {}~0x{:x} ...", t, hint, len); return lba_manager->reserve_region( t, hint, - len); + len + ).si_then([FNAME, &t](auto pin) { + SUBDEBUGT(seastore_tm, "reserved {}", t, *pin); + return pin; + }); } /* @@ -612,15 +572,17 @@ public: : mapping.get_key(); LOG_PREFIX(TransactionManager::clone_pin); - SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, clone_offset {}", - t, mapping.get_length(), hint, intermediate_key); + SUBDEBUGT(seastore_tm, "{} clone to hint {} ...", t, mapping, hint); return lba_manager->clone_mapping( t, hint, mapping.get_length(), intermediate_key, intermediate_base - ); + ).si_then([FNAME, &t](auto pin) { + SUBDEBUGT(seastore_tm, "cloned as {}", t, *pin); + return pin; + }); } /* alloc_extents @@ -635,10 +597,10 @@ public: extent_len_t len, int num) { LOG_PREFIX(TransactionManager::alloc_extents); - SUBDEBUGT(seastore_tm, "len={}, laddr_hint={}, num={}", - t, len, hint, num); + SUBDEBUGT(seastore_tm, "hint {}~({} * 0x{:x}) ...", + t, hint, num, len); return seastar::do_with(std::vector<TCachedExtentRef<T>>(), - [this, &t, hint, len, num] (auto &extents) { + [this, &t, hint, len, num, FNAME](auto &extents) { return trans_intr::do_for_each( boost::make_counting_iterator(0), boost::make_counting_iterator(num), @@ -647,7 +609,8 @@ public: [&extents](auto &&node) { extents.push_back(node); }); - }).si_then([&extents] { + }).si_then([&extents, &t, FNAME] { + SUBDEBUGT(seastore_tm, "allocated {} extents", t, extents.size()); return alloc_extents_iertr::make_ready_future <std::vector<TCachedExtentRef<T>>>(std::move(extents)); }); @@ -753,7 +716,7 @@ public: const std::string& key, const std::string& value) { LOG_PREFIX(TransactionManager::update_root_meta); - SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {}", t, key, value); + SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value); return cache->get_root( t ).si_then([this, &t, 
&key, &value](RootBlockRef root) { @@ -808,7 +771,7 @@ public: return cache->get_root(t).si_then([&t](auto croot) { LOG_PREFIX(TransactionManager::read_collection_root); auto ret = croot->get_root().collection_root.get(); - SUBTRACET(seastore_tm, "{}~{}", + SUBTRACET(seastore_tm, "{}~0x{:x}", t, ret.get_location(), ret.get_size()); return ret; }); @@ -821,7 +784,7 @@ public: */ void write_collection_root(Transaction &t, coll_root_t cmroot) { LOG_PREFIX(TransactionManager::write_collection_root); - SUBDEBUGT(seastore_tm, "{}~{}", + SUBDEBUGT(seastore_tm, "{}~0x{:x}", t, cmroot.get_location(), cmroot.get_size()); auto croot = cache->get_root_fast(t); croot = cache->duplicate_for_write(t, croot)->cast<RootBlock>(); @@ -853,6 +816,49 @@ private: shard_stats_t& shard_stats; + template <typename T> + std::variant<LBAMappingRef, base_iertr::future<TCachedExtentRef<T>>> + get_extent_if_linked( + Transaction &t, + LBAMappingRef pin) + { + ceph_assert(pin->is_parent_viewable()); + // checking the lba child must be atomic with creating + // and linking the absent child + auto v = pin->get_logical_extent(t); + if (v.has_child()) { + return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) { +#ifndef NDEBUG + auto lextent = extent->template cast<LogicalCachedExtent>(); + auto pin_laddr = pin->get_key(); + if (pin->is_indirect()) { + pin_laddr = pin->get_intermediate_base(); + } + assert(lextent->get_laddr() == pin_laddr); +#endif + return extent->template cast<T>(); + }); + } else { + return pin; + } + } + + base_iertr::future<LogicalCachedExtentRef> read_pin_by_type( + Transaction &t, + LBAMappingRef pin, + extent_types_t type) + { + ceph_assert(!pin->parent_modified()); + auto v = pin->get_logical_extent(t); + // checking the lba child must be atomic with creating + // and linking the absent child + if (v.has_child()) { + return std::move(v.get_child_fut()); + } else { + return pin_to_extent_by_type(t, std::move(pin), type); + } + } + rewrite_extent_ret rewrite_logical_extent( Transaction& t, LogicalCachedExtentRef extent); @@ -862,11 +868,6 @@ private: ExtentPlacementManager::dispatch_result_t dispatch_result, std::optional<journal_seq_t> seq_to_trim = std::nullopt); - /// Remove refcount for offset - ref_ret _dec_ref( - Transaction &t, - laddr_t offset); - using update_lba_mappings_ret = LBAManager::update_mappings_ret; update_lba_mappings_ret update_lba_mappings( Transaction &t, @@ -886,7 +887,7 @@ private: Transaction &t, LBAMappingRef pin) { LOG_PREFIX(TransactionManager::pin_to_extent); - SUBTRACET(seastore_tm, "getting extent {}", t, *pin); + SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin); static_assert(is_logical_type(T::TYPE)); using ret = pin_to_extent_ret<T>; auto &pref = *pin; @@ -950,7 +951,8 @@ private: extent_types_t type) { LOG_PREFIX(TransactionManager::pin_to_extent_by_type); - SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type); + SUBTRACET(seastore_tm, "getting absent extent from pin {} type {} ...", + t, *pin, type); assert(is_logical_type(type)); auto &pref = *pin; return cache->get_absent_extent_by_type( diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h index 683dc6ea649..522a93a1ddc 100644 --- a/src/crimson/osd/backfill_facades.h +++ b/src/crimson/osd/backfill_facades.h @@ -52,6 +52,12 @@ struct PeeringFacade final : BackfillState::PeeringFacade { return peering_state.is_backfilling(); } + void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &v, + const 
std::vector<pg_shard_t> &peers) override { + return peering_state.prepare_backfill_for_missing(soid, v, peers); + } PeeringFacade(PeeringState& peering_state) : peering_state(peering_state) { } diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 70c43f49faf..018e58b68f8 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -225,7 +225,7 @@ bool BackfillState::Enqueuing::should_rescan_primary( const BackfillInterval& backfill_info) const { return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) && - !backfill_info.extends_to_end(); + !backfill_info.extends_to_end() && backfill_info.empty(); } void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( @@ -266,6 +266,7 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) logger().debug("{}: check={}", __func__, check); const auto& primary_bi = backfill_state().backfill_info; result_t result { {}, primary_bi.begin }; + std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills; for (const auto& bt : peering_state().get_backfill_targets()) { const auto& peer_bi = backfill_state().peer_backfill_info.at(bt); @@ -273,9 +274,13 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) // Find all check peers that have the wrong version if (const eversion_t& obj_v = primary_bi.objects.begin()->second; check == primary_bi.begin && check == peer_bi.begin) { - if(peer_bi.objects.begin()->second != obj_v && - backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { - backfill_listener().enqueue_push(primary_bi.begin, obj_v); + if (peer_bi.objects.begin()->second != obj_v) { + std::ignore = backfill_state().progress_tracker->enqueue_push( + primary_bi.begin); + auto &[v, peers] = backfills[primary_bi.begin]; + assert(v == obj_v || v == eversion_t()); + v = obj_v; + peers.push_back(bt); } else { // it's fine, keep it! OR already recovering } @@ -284,12 +289,22 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check) // Only include peers that we've caught up to their backfill line // otherwise, they only appear to be missing this object // because their peer_bi.begin > backfill_info.begin. - if (primary_bi.begin > peering_state().get_peer_last_backfill(bt) && - backfill_state().progress_tracker->enqueue_push(primary_bi.begin)) { - backfill_listener().enqueue_push(primary_bi.begin, obj_v); + if (primary_bi.begin > peering_state().get_peer_last_backfill(bt)) { + std::ignore = backfill_state().progress_tracker->enqueue_push( + primary_bi.begin); + auto &[v, peers] = backfills[primary_bi.begin]; + assert(v == obj_v || v == eversion_t()); + v = obj_v; + peers.push_back(bt); } } } + for (auto &backfill : backfills) { + auto &soid = backfill.first; + auto &obj_v = backfill.second.first; + auto &peers = backfill.second.second; + backfill_listener().enqueue_push(soid, obj_v, peers); + } return result; } @@ -327,16 +342,29 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) } trim_backfill_infos(); - while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. 
+ logger().debug("{}: reached end for current local chunk", __func__); + post_event(RequestPrimaryScanning{}); + return; + } + + do { if (!backfill_listener().budget_available()) { post_event(RequestWaiting{}); return; } else if (should_rescan_replicas(backfill_state().peer_backfill_info, - primary_bi)) { + primary_bi)) { // Count simultaneous scans as a single op and let those complete post_event(RequestReplicasScanning{}); return; } + + if (all_emptied(primary_bi, backfill_state().peer_backfill_info)) { + break; + } // Get object within set of peers to operate on and the set of targets // for which that object applies. if (const hobject_t check = \ @@ -355,30 +383,23 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) trim_backfilled_object_from_intervals(std::move(result), backfill_state().last_backfill_started, backfill_state().peer_backfill_info); - primary_bi.pop_front(); + if (!primary_bi.empty()) { + primary_bi.pop_front(); + } } backfill_listener().maybe_flush(); - } + } while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)); - if (should_rescan_primary(backfill_state().peer_backfill_info, - primary_bi)) { - // need to grab one another chunk of the object namespace and restart - // the queueing. - logger().debug("{}: reached end for current local chunk", - __func__); - post_event(RequestPrimaryScanning{}); - } else { - if (backfill_state().progress_tracker->tracked_objects_completed() - && Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - backfill_state().last_backfill_started = hobject_t::get_max(); - backfill_listener().update_peers_last_backfill(hobject_t::get_max()); - } - logger().debug("{}: reached end for both local and all peers " - "but still has in-flight operations", __func__); - post_event(RequestWaiting{}); + if (backfill_state().progress_tracker->tracked_objects_completed() + && Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); } + logger().debug("{}: reached end for both local and all peers " + "but still has in-flight operations", __func__); + post_event(RequestWaiting{}); } // -- PrimaryScanning @@ -403,7 +424,7 @@ BackfillState::PrimaryScanning::react(ObjectPushed evt) { logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", evt.object); - backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -480,7 +501,7 @@ BackfillState::ReplicasScanning::react(ObjectPushed evt) { logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", evt.object); - backfill_state().progress_tracker->complete_to(evt.object, evt.stat); + backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -496,16 +517,8 @@ BackfillState::Waiting::react(ObjectPushed evt) { logger().debug("Waiting::react() on ObjectPushed; evt.object={}", evt.object); - backfill_state().progress_tracker->complete_to(evt.object, evt.stat); - if (!Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - return transit<Enqueuing>(); - } else { - // we still have something to wait on - logger().debug("Waiting::react() on ObjectPushed; still waiting"); - return discard_event(); 
- } + backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false); + return transit<Enqueuing>();; } // -- Done @@ -559,7 +572,8 @@ void BackfillState::ProgressTracker::enqueue_drop(const hobject_t& obj) void BackfillState::ProgressTracker::complete_to( const hobject_t& obj, - const pg_stat_t& stats) + const pg_stat_t& stats, + bool may_push_to_max) { logger().debug("{}: obj={}", __func__, obj); @@ -570,6 +584,7 @@ void BackfillState::ProgressTracker::complete_to( } else { ceph_abort_msg("completing untracked object shall not happen"); } + auto new_last_backfill = peering_state().earliest_backfill(); for (auto it = std::begin(registry); it != std::end(registry) && it->second.stage != op_stage_t::enqueued_push; @@ -579,15 +594,18 @@ void BackfillState::ProgressTracker::complete_to( peering_state().update_complete_backfill_object_stats( soid, *item.stats); + assert(soid > new_last_backfill); + new_last_backfill = soid; } - if (Enqueuing::all_enqueued(peering_state(), + if (may_push_to_max && + Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, backfill_state().peer_backfill_info) && tracked_objects_completed()) { backfill_state().last_backfill_started = hobject_t::get_max(); backfill_listener().update_peers_last_backfill(hobject_t::get_max()); } else { - backfill_listener().update_peers_last_backfill(obj); + backfill_listener().update_peers_last_backfill(new_last_backfill); } } diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 6c36db81813..ddc0cbf7355 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -336,7 +336,8 @@ struct BackfillState::BackfillListener { virtual void enqueue_push( const hobject_t& obj, - const eversion_t& v) = 0; + const eversion_t& v, + const std::vector<pg_shard_t> &peers) = 0; virtual void enqueue_drop( const pg_shard_t& target, @@ -375,6 +376,10 @@ struct BackfillState::PeeringFacade { virtual void update_complete_backfill_object_stats(const hobject_t &hoid, const pg_stat_t &stats) = 0; virtual bool is_backfilling() const = 0; + virtual void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) = 0; virtual ~PeeringFacade() {} }; @@ -421,7 +426,7 @@ public: bool enqueue_push(const hobject_t&); void enqueue_drop(const hobject_t&); - void complete_to(const hobject_t&, const pg_stat_t&); + void complete_to(const hobject_t&, const pg_stat_t&, bool may_push_to_max); }; } // namespace crimson::osd diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index df4f73d4077..9bf60140374 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -504,7 +504,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( auto p = ss.clone_snaps.find(clone); if (p == ss.clone_snaps.end()) { logger().error( - "OpsExecutor::do_list_snaps: {} has inconsistent " + "OpsExecuter::do_list_snaps: {} has inconsistent " "clone_snaps, missing clone {}", os.oi.soid, clone); @@ -518,7 +518,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( auto p = ss.clone_overlap.find(clone); if (p == ss.clone_overlap.end()) { logger().error( - "OpsExecutor::do_list_snaps: {} has inconsistent " + "OpsExecuter::do_list_snaps: {} has inconsistent " "clone_overlap, missing clone {}", os.oi.soid, clone); @@ -532,7 +532,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( auto p = ss.clone_size.find(clone); if (p == ss.clone_size.end()) { 
logger().error( - "OpsExecutor::do_list_snaps: {} has inconsistent " + "OpsExecuter::do_list_snaps: {} has inconsistent " "clone_size, missing clone {}", os.oi.soid, clone); @@ -551,7 +551,7 @@ OpsExecuter::list_snaps_iertr::future<> OpsExecuter::do_list_snaps( } resp.seq = ss.seq; logger().error( - "OpsExecutor::do_list_snaps: {}, resp.clones.size(): {}", + "OpsExecuter::do_list_snaps: {}, resp.clones.size(): {}", os.oi.soid, resp.clones.size()); resp.encode(osd_op.outdata); @@ -678,16 +678,32 @@ OpsExecuter::do_execute_op(OSDOp& osd_op) whiteout = true; } return do_write_op([this, whiteout](auto& backend, auto& os, auto& txn) { - int num_bytes = 0; - // Calculate num_bytes to be removed - if (obc->obs.oi.soid.is_snap()) { - ceph_assert(obc->ssc->snapset.clone_overlap.count(obc->obs.oi.soid.snap)); - num_bytes = obc->ssc->snapset.get_clone_bytes(obc->obs.oi.soid.snap); - } else { - num_bytes = obc->obs.oi.size; - } - return backend.remove(os, txn, *osd_op_params, - delta_stats, whiteout, num_bytes); + struct emptyctx_t {}; + return with_effect_on_obc( + emptyctx_t{}, + [&](auto &ctx) { + int num_bytes = 0; + // Calculate num_bytes to be removed + if (obc->obs.oi.soid.is_snap()) { + ceph_assert(obc->ssc->snapset.clone_overlap.count( + obc->obs.oi.soid.snap)); + num_bytes = obc->ssc->snapset.get_clone_bytes( + obc->obs.oi.soid.snap); + } else { + num_bytes = obc->obs.oi.size; + } + return backend.remove(os, txn, *osd_op_params, + delta_stats, whiteout, num_bytes); + }, + [](auto &&ctx, ObjectContextRef obc, Ref<PG>) { + return seastar::do_for_each( + obc->watchers, + [](auto &p) { return p.second->remove(); } + ).then([obc] { + obc->watchers.clear(); + return seastar::now(); + }); + }); }); } case CEPH_OSD_OP_CALL: @@ -957,7 +973,7 @@ void OpsExecuter::CloningContext::apply_to( processed_obc.ssc->snapset = std::move(new_snapset); } -OpsExecuter::interruptible_future<std::vector<pg_log_entry_t>> +std::vector<pg_log_entry_t> OpsExecuter::flush_clone_metadata( std::vector<pg_log_entry_t>&& log_entries, SnapMapper& snap_mapper, @@ -965,7 +981,6 @@ OpsExecuter::flush_clone_metadata( ceph::os::Transaction& txn) { assert(!txn.empty()); - auto maybe_snap_mapped = interruptor::now(); update_clone_overlap(); if (cloning_ctx) { std::move(*cloning_ctx).apply_to(log_entries, *obc); @@ -977,12 +992,7 @@ OpsExecuter::flush_clone_metadata( } logger().debug("{} done, initial snapset={}, new snapset={}", __func__, obc->obs.oi.soid, obc->ssc->snapset); - return std::move( - maybe_snap_mapped - ).then_interruptible([log_entries=std::move(log_entries)]() mutable { - return interruptor::make_ready_future<std::vector<pg_log_entry_t>>( - std::move(log_entries)); - }); + return std::move(log_entries); } ObjectContextRef OpsExecuter::prepare_clone( diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 0dea7d0515e..e770e825b32 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -40,7 +40,7 @@ namespace crimson::osd { class PG; // OpsExecuter -- a class for executing ops targeting a certain object. -class OpsExecuter : public seastar::enable_lw_shared_from_this<OpsExecuter> { +class OpsExecuter { friend class SnapTrimObjSubEvent; using call_errorator = crimson::errorator< @@ -170,16 +170,12 @@ public: object_stat_sum_t delta_stats; private: - // an operation can be divided into two stages: main and effect-exposing - // one. 
The former is performed immediately on call to `do_osd_op()` while - // the later on `submit_changes()` – after successfully processing main - // stages of all involved operations. When any stage fails, none of all - // scheduled effect-exposing stages will be executed. - // when operation requires this division, some variant of `with_effect()` - // should be used. + // with_effect can be used to schedule operations to be performed + // at commit time. effects will be discarded if the operation does + // not commit. struct effect_t { // an effect can affect PG, i.e. create a watch timeout - virtual osd_op_errorator::future<> execute(Ref<PG> pg) = 0; + virtual seastar::future<> execute(Ref<PG> pg) = 0; virtual ~effect_t() = default; }; @@ -213,10 +209,10 @@ private: * execute_clone * * If snapc contains a snap which occurred logically after the last write - * seen by this object (see OpsExecutor::should_clone()), we first need + * seen by this object (see OpsExecuter::should_clone()), we first need * make a clone of the object at its current state. execute_clone primes * txn with that clone operation and returns an - * OpsExecutor::CloningContext which will allow us to fill in the corresponding + * OpsExecuter::CloningContext which will allow us to fill in the corresponding * metadata and log_entries once the operations have been processed. * * Note that this strategy differs from classic, which instead performs this @@ -267,7 +263,7 @@ private: */ void update_clone_overlap(); - interruptible_future<std::vector<pg_log_entry_t>> flush_clone_metadata( + std::vector<pg_log_entry_t> flush_clone_metadata( std::vector<pg_log_entry_t>&& log_entries, SnapMapper& snap_mapper, OSDriver& osdriver, @@ -400,7 +396,7 @@ public: execute_op(OSDOp& osd_op); using rep_op_fut_tuple = - std::tuple<interruptible_future<>, osd_op_ierrorator::future<>>; + std::tuple<interruptible_future<>, interruptible_future<>>; using rep_op_fut_t = interruptible_future<rep_op_fut_tuple>; template <typename MutFunc> @@ -475,7 +471,7 @@ auto OpsExecuter::with_effect_on_obc( effect_func(std::move(effect_func)), obc(std::move(obc)) { } - osd_op_errorator::future<> execute(Ref<PG> pg) final { + seastar::future<> execute(Ref<PG> pg) final { return std::move(effect_func)(std::move(ctx), std::move(obc), std::move(pg)); @@ -502,15 +498,14 @@ OpsExecuter::flush_changes_n_do_ops_effects( assert(obc); auto submitted = interruptor::now(); - auto all_completed = - interruptor::make_interruptible(osd_op_errorator::now()); + auto all_completed = interruptor::now(); if (cloning_ctx) { ceph_assert(want_mutate); } if (want_mutate) { - auto log_entries = co_await flush_clone_metadata( + auto log_entries = flush_clone_metadata( prepare_transaction(ops), snap_mapper, osdriver, @@ -536,7 +531,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( // need extra ref pg due to apply_stats() which can be executed after // informing snap mapper all_completed = - std::move(all_completed).safe_then_interruptible([this, pg=this->pg] { + std::move(all_completed).then_interruptible([this, pg=this->pg] { // let's do the cleaning of `op_effects` in destructor return interruptor::do_for_each(op_effects, [pg=std::move(pg)](auto& op_effect) { @@ -552,21 +547,19 @@ OpsExecuter::flush_changes_n_do_ops_effects( template <class Func> struct OpsExecuter::RollbackHelper { - void rollback_obc_if_modified(const std::error_code& e); - seastar::lw_shared_ptr<OpsExecuter> ox; + void rollback_obc_if_modified(); + OpsExecuter *ox; Func func; }; template <class Func> inline 
OpsExecuter::RollbackHelper<Func> OpsExecuter::create_rollbacker(Func&& func) { - return {shared_from_this(), std::forward<Func>(func)}; + return {this, std::forward<Func>(func)}; } - template <class Func> -void OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified( - const std::error_code& e) +void OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified() { // Oops, an operation had failed. do_osd_ops() altogether with // OpsExecuter already dropped the ObjectStore::Transaction if @@ -584,10 +577,9 @@ void OpsExecuter::RollbackHelper<Func>::rollback_obc_if_modified( assert(ox); const auto need_rollback = ox->has_seen_write(); crimson::get_logger(ceph_subsys_osd).debug( - "{}: object {} got error {}, need_rollback={}", + "{}: object {} got error, need_rollback={}", __func__, ox->obc->get_oid(), - e, need_rollback); if (need_rollback) { func(ox->obc); diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index 8d2d10fbd7c..34ad97ceb06 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -23,6 +23,7 @@ #include "messages/MOSDOp.h" #include "messages/MOSDPeeringOp.h" #include "messages/MOSDPGCreate2.h" +#include "messages/MOSDPGRemove.h" #include "messages/MOSDPGUpdateLogMissing.h" #include "messages/MOSDPGUpdateLogMissingReply.h" #include "messages/MOSDRepOpReply.h" @@ -863,6 +864,8 @@ OSD::do_ms_dispatch( [[fallthrough]]; case MSG_OSD_PG_LOG: return handle_peering_op(conn, boost::static_pointer_cast<MOSDPeeringOp>(m)); + case MSG_OSD_PG_REMOVE: + return handle_pg_remove(conn, boost::static_pointer_cast<MOSDPGRemove>(m)); case MSG_OSD_REPOP: return handle_rep_op(conn, boost::static_pointer_cast<MOSDRepOp>(m)); case MSG_OSD_REPOPREPLY: @@ -1555,6 +1558,27 @@ seastar::future<> OSD::handle_peering_op( std::move(*evt)).second; } +seastar::future<> OSD::handle_pg_remove( + crimson::net::ConnectionRef conn, + Ref<MOSDPGRemove> m) +{ + LOG_PREFIX(OSD::handle_pg_remove); + const int from = m->get_source().num(); + std::vector<seastar::future<>> futs; + for (auto &pg : m->pg_list) { + DEBUG("{} from {}", pg, from); + futs.emplace_back( + pg_shard_manager.start_pg_operation<RemotePeeringEvent>( + conn, + pg_shard_t{from, pg.shard}, + pg, + m->get_epoch(), + m->get_epoch(), + PeeringState::DeleteStart()).second); + } + return seastar::when_all_succeed(std::move(futs)); +} + seastar::future<> OSD::check_osdmap_features() { LOG_PREFIX(OSD::check_osdmap_features); diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h index de39d808274..d7d54d5d2c3 100644 --- a/src/crimson/osd/osd.h +++ b/src/crimson/osd/osd.h @@ -208,6 +208,8 @@ private: Ref<MOSDRepOpReply> m); seastar::future<> handle_peering_op(crimson::net::ConnectionRef conn, Ref<MOSDPeeringOp> m); + seastar::future<> handle_pg_remove(crimson::net::ConnectionRef conn, + Ref<MOSDPGRemove> m); seastar::future<> handle_recovery_subreq(crimson::net::ConnectionRef conn, Ref<MOSDFastDispatchOp> m); seastar::future<> handle_scrub_command(crimson::net::ConnectionRef conn, diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h index fb0432edb8f..fd8b049c0bf 100644 --- a/src/crimson/osd/osd_operation.h +++ b/src/crimson/osd/osd_operation.h @@ -40,6 +40,37 @@ struct PerShardPipeline { } create_or_wait_pg; }; +struct PGPeeringPipeline { + struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> { + static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map"; + } await_map; + struct Process : OrderedExclusivePhaseT<Process> { + static constexpr auto type_name = "PeeringEvent::PGPipeline::process"; 
+ } process; +}; + +struct CommonPGPipeline { + struct WaitForActive : OrderedExclusivePhaseT<WaitForActive> { + static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; + } wait_for_active; + struct RecoverMissing : OrderedConcurrentPhaseT<RecoverMissing> { + static constexpr auto type_name = "CommonPGPipeline::recover_missing"; + } recover_missing; + struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT<CheckAlreadyCompleteGetObc> { + static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc"; + } check_already_complete_get_obc; + struct LockOBC : OrderedConcurrentPhaseT<LockOBC> { + static constexpr auto type_name = "CommonPGPipeline::lock_obc"; + } lock_obc; + struct Process : OrderedExclusivePhaseT<Process> { + static constexpr auto type_name = "CommonPGPipeline::process"; + } process; + struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> { + static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop"; + } wait_repop; +}; + + enum class OperationTypeCode { client_request = 0, peering_event, diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h index 530732ba710..d2786a95e4d 100644 --- a/src/crimson/osd/osd_operation_external_tracking.h +++ b/src/crimson/osd/osd_operation_external_tracking.h @@ -36,7 +36,6 @@ struct LttngBackend ClientRequest::PGPipeline::RecoverMissing:: BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend, - ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::Process::BlockingEvent::Backend, @@ -117,10 +116,6 @@ struct LttngBackend const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override { } - void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::GetOBC& blocker) override { - } void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev, const Operation& op, @@ -171,7 +166,6 @@ struct HistoricBackend ClientRequest::PGPipeline::RecoverMissing:: BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend, - ClientRequest::PGPipeline::GetOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend, ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend, ClientRequest::PGPipeline::Process::BlockingEvent::Backend, @@ -252,11 +246,6 @@ struct HistoricBackend const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override { } - void handle(ClientRequest::PGPipeline::GetOBC::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::GetOBC& blocker) override { - } - void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev, const Operation& op, const ClientRequest::PGPipeline::LockOBC& blocker) override { diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc index 8e9a7c4d749..a89fb2c84bc 100644 --- a/src/crimson/osd/osd_operations/client_request.cc +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -403,11 +403,6 @@ ClientRequest::process_op( *pg, *this, this_instance_id); return do_process( ihref, pg, obc, this_instance_id - ).handle_error_interruptible( - crimson::ct_error::eagain::handle( - [this, pg, 
this_instance_id, &ihref]() mutable { - return process_op(ihref, pg, this_instance_id); - }) ); } ); @@ -437,7 +432,7 @@ ClientRequest::process_op( co_await std::move(process); } -ClientRequest::do_process_iertr::future<> +ClientRequest::interruptible_future<> ClientRequest::do_process( instance_handle_t &ihref, Ref<PG> pg, crimson::osd::ObjectContextRef obc, @@ -507,22 +502,128 @@ ClientRequest::do_process( co_return; } - auto [submitted, all_completed] = co_await pg->do_osd_ops( - m, r_conn, obc, op_info, snapc + OpsExecuter ox(pg, obc, op_info, *m, r_conn, snapc); + auto ret = co_await pg->run_executer( + ox, obc, op_info, m->ops + ).si_then([]() -> std::optional<std::error_code> { + return std::nullopt; + }).handle_error_interruptible(crimson::ct_error::all_same_way( + [](auto e) -> std::optional<std::error_code> { + return e; + }) ); - co_await std::move(submitted); - co_await ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this); + auto should_log_error = [](std::error_code e) -> bool { + switch (e.value()) { + case EDQUOT: + case ENOSPC: + case EAGAIN: + return false; + default: + return true; + } + }; - auto reply = co_await std::move(all_completed); + if (ret && !should_log_error(*ret)) { + co_await reply_op_error(pg, -ret->value()); + co_return; + } + + { + auto all_completed = interruptor::now(); + if (ret) { + assert(should_log_error(*ret)); + if (op_info.may_write()) { + auto rep_tid = pg->shard_services.get_tid(); + auto version = co_await pg->submit_error_log( + m, op_info, obc, *ret, rep_tid); + + all_completed = pg->complete_error_log( + rep_tid, version); + } + // simply return the error below, leaving all_completed alone + } else { + auto submitted = interruptor::now(); + std::tie(submitted, all_completed) = co_await pg->submit_executer( + std::move(ox), m->ops); + co_await std::move(submitted); + } + co_await ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this); + + co_await std::move(all_completed); + } co_await ihref.enter_stage<interruptor>(client_pp(*pg).send_reply, *this); - DEBUGDPP("{}.{}: sending response", - *pg, *this, this_instance_id); - // TODO: gate the crosscore sending - co_await interruptor::make_interruptible( - get_foreign_connection().send_with_throttling(std::move(reply)) - ); + + if (ret) { + int err = -ret->value(); + DEBUGDPP("{}: replying with error {}", *pg, *this, err); + + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), err, pg->get_osdmap_epoch(), 0, false); + + if (!m->ops.empty() && m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) { + reply->set_result(0); + } + + // For all ops except for CMPEXT, the correct error value is encoded + // in e. For CMPEXT, osdop.rval has the actual error value. + if (err == -ct_error::cmp_fail_error_value) { + assert(!m->ops.empty()); + for (auto &osdop : m->ops) { + if (osdop.rval < 0) { + reply->set_result(osdop.rval); + break; + } + } + } + + reply->set_enoent_reply_versions( + pg->peering_state.get_info().last_update, + pg->peering_state.get_info().last_user_version); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + + // TODO: gate the crosscore sending + co_await interruptor::make_interruptible( + get_foreign_connection().send_with_throttling(std::move(reply))); + } else { + int result = m->ops.empty() ? 
0 : m->ops.back().rval.code; + if (op_info.may_read() && result >= 0) { + for (auto &osdop : m->ops) { + if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = osdop.rval.code; + break; + } + } + } else if (result > 0 && op_info.may_write() && !op_info.allows_returnvec()) { + result = 0; + } else if (result < 0 && + (m->ops.empty() ? + 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { + result = 0; + } + auto reply = crimson::make_message<MOSDOpReply>( + m.get(), + result, + pg->get_osdmap_epoch(), + 0, + false); + reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); + if (obc->obs.exists) { + reply->set_reply_versions(pg->peering_state.get_info().last_update, + obc->obs.oi.user_version); + } else { + reply->set_reply_versions(pg->peering_state.get_info().last_update, + pg->peering_state.get_info().last_user_version); + } + + DEBUGDPP("{}.{}: sending response {}", + *pg, *this, this_instance_id, *m); + // TODO: gate the crosscore sending + co_await interruptor::make_interruptible( + get_foreign_connection().send_with_throttling(std::move(reply)) + ); + } } bool ClientRequest::is_misdirected(const PG& pg) const diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index ea7aade22ac..6ee57e9874c 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -14,7 +14,6 @@ #include "crimson/osd/osdmap_gate.h" #include "crimson/osd/osd_operation.h" #include "crimson/osd/osd_operations/client_request_common.h" -#include "crimson/osd/osd_operations/common/pg_pipeline.h" #include "crimson/osd/pg_activation_blocker.h" #include "crimson/osd/pg_map.h" #include "crimson/osd/scrub/pg_scrubber.h" @@ -104,7 +103,6 @@ public: PGPipeline::RecoverMissing::BlockingEvent, scrub::PGScrubber::BlockingEvent, PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, - PGPipeline::GetOBC::BlockingEvent, PGPipeline::LockOBC::BlockingEvent, PGPipeline::Process::BlockingEvent, PGPipeline::WaitRepop::BlockingEvent, @@ -276,12 +274,7 @@ private: interruptible_future<> with_sequencer(FuncT&& func); interruptible_future<> reply_op_error(const Ref<PG>& pg, int err); - - using do_process_iertr = - ::crimson::interruptible::interruptible_errorator< - ::crimson::osd::IOInterruptCondition, - ::crimson::errorator<crimson::ct_error::eagain>>; - do_process_iertr::future<> do_process( + interruptible_future<> do_process( instance_handle_t &ihref, Ref<PG> pg, crimson::osd::ObjectContextRef obc, diff --git a/src/crimson/osd/osd_operations/client_request_common.cc b/src/crimson/osd/osd_operations/client_request_common.cc index a56d58d2066..68638d3a7b1 100644 --- a/src/crimson/osd/osd_operations/client_request_common.cc +++ b/src/crimson/osd/osd_operations/client_request_common.cc @@ -71,30 +71,4 @@ CommonClientRequest::do_recover_missing( } } -bool CommonClientRequest::should_abort_request( - const Operation& op, - std::exception_ptr eptr) -{ - if (*eptr.__cxa_exception_type() == - typeid(::crimson::common::actingset_changed)) { - try { - std::rethrow_exception(eptr); - } catch(::crimson::common::actingset_changed& e) { - if (e.is_primary()) { - logger().debug("{} {} operation restart, acting set changed", __func__, op); - return false; - } else { - logger().debug("{} {} operation abort, up primary changed", __func__, op); - return true; - } - } - } else { - assert(*eptr.__cxa_exception_type() == - typeid(crimson::common::system_shutdown_exception)); - 
crimson::get_logger(ceph_subsys_osd).debug( - "{} {} operation skipped, system shutdown", __func__, op); - return true; - } -} - } // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/client_request_common.h b/src/crimson/osd/osd_operations/client_request_common.h index 951bf653799..4c3cf42777b 100644 --- a/src/crimson/osd/osd_operations/client_request_common.h +++ b/src/crimson/osd/osd_operations/client_request_common.h @@ -16,9 +16,6 @@ struct CommonClientRequest { Ref<PG> pg, const hobject_t& soid, const osd_reqid_t& reqid); - - static bool should_abort_request( - const crimson::Operation& op, std::exception_ptr eptr); }; } // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/common/pg_pipeline.h b/src/crimson/osd/osd_operations/common/pg_pipeline.h deleted file mode 100644 index 2b2d03ae4b3..00000000000 --- a/src/crimson/osd/osd_operations/common/pg_pipeline.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#pragma once - -#include "osd/osd_op_util.h" -#include "crimson/osd/osd_operation.h" - -namespace crimson::osd { - -class CommonPGPipeline { -protected: - friend class InternalClientRequest; - friend class SnapTrimEvent; - friend class SnapTrimObjSubEvent; - - struct WaitForActive : OrderedExclusivePhaseT<WaitForActive> { - static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; - } wait_for_active; - struct RecoverMissing : OrderedConcurrentPhaseT<RecoverMissing> { - static constexpr auto type_name = "CommonPGPipeline::recover_missing"; - } recover_missing; - struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT<CheckAlreadyCompleteGetObc> { - static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc"; - } check_already_complete_get_obc; - struct GetOBC : OrderedExclusivePhaseT<GetOBC> { - static constexpr auto type_name = "CommonPGPipeline::get_obc"; - } get_obc; - struct LockOBC : OrderedConcurrentPhaseT<LockOBC> { - static constexpr auto type_name = "CommonPGPipeline::lock_obc"; - } lock_obc; - struct Process : OrderedExclusivePhaseT<Process> { - static constexpr auto type_name = "CommonPGPipeline::process"; - } process; - struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> { - static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop"; - } wait_repop; -}; - -} // namespace crimson::osd diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index 2968a6f4385..9e5867caf80 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -50,96 +50,107 @@ CommonPGPipeline& InternalClientRequest::client_pp() return pg->request_pg_pipeline; } +InternalClientRequest::interruptible_future<> +InternalClientRequest::do_process( + crimson::osd::ObjectContextRef obc, + std::vector<OSDOp> &osd_ops) +{ + LOG_PREFIX(InternalClientRequest::do_process); + auto params = get_do_osd_ops_params(); + OpsExecuter ox( + pg, obc, op_info, params, params.get_connection(), SnapContext{}); + co_await pg->run_executer( + ox, obc, op_info, osd_ops + ).handle_error_interruptible( + crimson::ct_error::all_same_way( + [this, FNAME](auto e) { + ERRORDPPI("{}: got unexpected error {}", *pg, *this, e); + ceph_assert(0 == "should not return an error"); + return interruptor::now(); + }) + ); + + auto [submitted, completed] = co_await pg->submit_executer( + std::move(ox), osd_ops); + + 
co_await std::move(submitted); + co_await std::move(completed); +} + +InternalClientRequest::interruptible_future<> +InternalClientRequest::with_interruption() +{ + LOG_PREFIX(InternalClientRequest::with_interruption); + co_await enter_stage<interruptor>( + client_pp().wait_for_active + ); + + co_await with_blocking_event<PGActivationBlocker::BlockingEvent, + interruptor>([this] (auto&& trigger) { + return pg->wait_for_active_blocker.wait(std::move(trigger)); + }); + + co_await enter_stage<interruptor>(client_pp().recover_missing); + + bool unfound = co_await do_recover_missing( + pg, get_target_oid(), osd_reqid_t()); + + if (unfound) { + throw std::system_error( + std::make_error_code(std::errc::operation_canceled), + fmt::format("{} is unfound, drop it!", get_target_oid())); + } + co_await enter_stage<interruptor>( + client_pp().check_already_complete_get_obc); + + DEBUGI("{}: getting obc lock", *this); + + auto osd_ops = create_osd_ops(); + + DEBUGI("InternalClientRequest: got {} OSDOps to execute", + std::size(osd_ops)); + [[maybe_unused]] const int ret = op_info.set_from_op( + std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); + assert(ret == 0); + // call with_locked_obc() in order, but wait concurrently for loading. + enter_stage_sync(client_pp().lock_obc); + + auto fut = pg->with_locked_obc( + get_target_oid(), op_info, + [&osd_ops, this](auto, auto obc) { + return enter_stage<interruptor>(client_pp().process + ).then_interruptible( + [obc=std::move(obc), &osd_ops, this]() mutable { + return do_process(std::move(obc), osd_ops); + }); + }).handle_error_interruptible( + crimson::ct_error::assert_all("unexpected error") + ); + co_await std::move(fut); + + logger().debug("{}: complete", *this); + co_await interruptor::make_interruptible(handle.complete()); + co_return; +} + seastar::future<> InternalClientRequest::start() { track_event<StartEvent>(); - return crimson::common::handle_system_shutdown([this] { - return seastar::repeat([this] { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("{}: in repeat", *this); - return interruptor::with_interruption([this]() mutable { - return enter_stage<interruptor>( - client_pp().wait_for_active - ).then_interruptible([this] { - return with_blocking_event<PGActivationBlocker::BlockingEvent, - interruptor>([this] (auto&& trigger) { - return pg->wait_for_active_blocker.wait(std::move(trigger)); - }); - }).then_interruptible([this] { - return enter_stage<interruptor>( - client_pp().recover_missing); - }).then_interruptible([this] { - return do_recover_missing(pg, get_target_oid(), osd_reqid_t()); - }).then_interruptible([this](bool unfound) { - if (unfound) { - throw std::system_error( - std::make_error_code(std::errc::operation_canceled), - fmt::format("{} is unfound, drop it!", get_target_oid())); - } - return enter_stage<interruptor>( - client_pp().get_obc); - }).then_interruptible([this] () -> PG::load_obc_iertr::future<> { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("{}: getting obc lock", *this); - return seastar::do_with(create_osd_ops(), - [this](auto& osd_ops) mutable { - LOG_PREFIX(InternalClientRequest::start); - DEBUGI("InternalClientRequest: got {} OSDOps to execute", - std::size(osd_ops)); - [[maybe_unused]] const int ret = op_info.set_from_op( - std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); - assert(ret == 0); - // call with_locked_obc() in order, but wait concurrently for loading. 
- enter_stage_sync(client_pp().lock_obc); - return pg->with_locked_obc(get_target_oid(), op_info, - [&osd_ops, this](auto, auto obc) { - return enter_stage<interruptor>(client_pp().process - ).then_interruptible( - [obc=std::move(obc), &osd_ops, this] { - return pg->do_osd_ops( - std::move(obc), - osd_ops, - std::as_const(op_info), - get_do_osd_ops_params() - ).safe_then_unpack_interruptible( - [](auto submitted, auto all_completed) { - return all_completed.handle_error_interruptible( - crimson::ct_error::eagain::handle([] { - return seastar::now(); - })); - }, crimson::ct_error::eagain::handle([] { - return interruptor::now(); - }) - ); - }); - }); - }); - }).si_then([this] { - logger().debug("{}: complete", *this); - return handle.complete(); - }).handle_error_interruptible( - PG::load_obc_ertr::all_same_way([] { - return seastar::now(); - }) - ).then_interruptible([] { - return seastar::stop_iteration::yes; - }); - }, [this](std::exception_ptr eptr) { - if (should_abort_request(*this, std::move(eptr))) { - return seastar::stop_iteration::yes; - } else { - return seastar::stop_iteration::no; - } - }, pg, start_epoch); - }).then([this] { - track_event<CompletionEvent>(); - }).handle_exception_type([](std::system_error &error) { - logger().debug("error {}, message: {}", error.code(), error.what()); - return seastar::now(); - }).finally([this] { - logger().debug("{}: exit", *this); - handle.exit(); - }); + LOG_PREFIX(InternalClientRequest::start); + DEBUGI("{}: in repeat", *this); + + return interruptor::with_interruption([this]() mutable { + return with_interruption(); + }, [](std::exception_ptr eptr) { + return seastar::now(); + }, pg, start_epoch).then([this] { + track_event<CompletionEvent>(); + }).handle_exception_type([](std::system_error &error) { + logger().debug("error {}, message: {}", error.code(), error.what()); + return seastar::now(); + }).finally([this] { + logger().debug("{}: exit", *this); + handle.exit(); }); } diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index f198e584643..6023db0a8db 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -6,7 +6,6 @@ #include "crimson/common/type_helpers.h" #include "crimson/osd/osd_operation.h" #include "crimson/osd/osd_operations/client_request_common.h" -#include "crimson/osd/osd_operations/common/pg_pipeline.h" #include "crimson/osd/pg.h" #include "crimson/osd/pg_activation_blocker.h" @@ -41,6 +40,11 @@ private: CommonPGPipeline& client_pp(); + InternalClientRequest::interruptible_future<> with_interruption(); + InternalClientRequest::interruptible_future<> do_process( + crimson::osd::ObjectContextRef obc, + std::vector<OSDOp> &osd_ops); + seastar::future<> do_process(); Ref<PG> pg; @@ -56,7 +60,7 @@ public: CommonPGPipeline::WaitForActive::BlockingEvent, PGActivationBlocker::BlockingEvent, CommonPGPipeline::RecoverMissing::BlockingEvent, - CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, CommonPGPipeline::LockOBC::BlockingEvent, CommonPGPipeline::Process::BlockingEvent, CompletionEvent diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h index 1e6bd957289..85de5c711d6 100644 --- a/src/crimson/osd/osd_operations/peering_event.h +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -23,15 +23,6 @@ class ShardServices; class PG; class BackfillRecovery; - struct 
PGPeeringPipeline { - struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> { - static constexpr auto type_name = "PeeringEvent::PGPipeline::await_map"; - } await_map; - struct Process : OrderedExclusivePhaseT<Process> { - static constexpr auto type_name = "PeeringEvent::PGPipeline::process"; - } process; - }; - template <class T> class PeeringEvent : public PhasedOperationT<T> { T* that() { diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc index 7512b3d108d..9ed0b73cfb4 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.cc +++ b/src/crimson/osd/osd_operations/snaptrim_event.cc @@ -396,7 +396,7 @@ SnapTrimObjSubEvent::start() }); co_await enter_stage<interruptor>( - client_pp().get_obc); + client_pp().check_already_complete_get_obc); logger().debug("{}: getting obc for {}", *this, coid); // end of commonality diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h index 06d8f43c2f3..1164b3169d2 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.h +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -9,7 +9,6 @@ #include "crimson/osd/osdmap_gate.h" #include "crimson/osd/osd_operation.h" #include "crimson/common/subop_blocker.h" -#include "crimson/osd/osd_operations/common/pg_pipeline.h" #include "crimson/osd/pg.h" #include "crimson/osd/pg_activation_blocker.h" #include "osd/osd_types.h" @@ -170,7 +169,7 @@ public: std::tuple< StartEvent, - CommonPGPipeline::GetOBC::BlockingEvent, + CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, CommonPGPipeline::Process::BlockingEvent, CommonPGPipeline::WaitRepop::BlockingEvent, CompletionEvent diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index d210773ca30..744a1dbc02b 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -13,6 +13,9 @@ #include <boost/range/numeric.hpp> #include <fmt/format.h> #include <fmt/ostream.h> + +#include <seastar/util/defer.hh> + #include "include/utime_fmt.h" #include "common/hobject.h" @@ -481,6 +484,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) auto [objs_to_rm, next] = fut.get(); if (objs_to_rm.empty()) { logger().info("all objs removed, removing coll for {}", pgid); + t.remove(coll_ref->get_cid(), pgid.make_snapmapper_oid()); t.remove(coll_ref->get_cid(), pgmeta_oid); t.remove_collection(coll_ref->get_cid()); (void) shard_services.get_store().do_transaction( @@ -490,7 +494,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next) return {next, false}; } else { for (auto &obj : objs_to_rm) { - if (obj == pgmeta_oid) { + if (obj == pgmeta_oid || obj.is_internal_pg_local()) { continue; } logger().trace("pg {}, removing obj {}", pgid, obj); @@ -517,7 +521,8 @@ Context *PG::on_clean() { recovery_handler->on_pg_clean(); scrubber.on_primary_active_clean(); - return nullptr; + recovery_finisher = new C_PG_FinishRecovery(*this); + return recovery_finisher; } seastar::future<> PG::clear_temp_objects() @@ -973,150 +978,6 @@ ObjectContextRef duplicate_obc(const ObjectContextRef &obc) { return object_context; } -template <class Ret, class SuccessFunc, class FailureFunc> -PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<Ret>> -PG::do_osd_ops_execute( - seastar::lw_shared_ptr<OpsExecuter> ox, - ObjectContextRef obc, - const OpInfo &op_info, - Ref<MOSDOp> m, - std::vector<OSDOp>& ops, - SuccessFunc&& success_func, - FailureFunc&& failure_func) -{ - assert(ox); - auto rollbacker = ox->create_rollbacker( - 
[object_context=duplicate_obc(obc)] (auto& obc) mutable { - obc->update_from(*object_context); - }); - auto failure_func_ptr = seastar::make_lw_shared(std::move(failure_func)); - return interruptor::do_for_each(ops, [ox](OSDOp& osd_op) { - logger().debug( - "do_osd_ops_execute: object {} - handling op {}", - ox->get_target(), - ceph_osd_op_name(osd_op.op.op)); - return ox->execute_op(osd_op); - }).safe_then_interruptible([this, ox, &ops] { - logger().debug( - "do_osd_ops_execute: object {} all operations successful", - ox->get_target()); - // check for full - if ((ox->delta_stats.num_bytes > 0 || - ox->delta_stats.num_objects > 0) && - get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { - const auto& m = ox->get_message(); - if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now - m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { - logger().info(" full, but proceeding due to FULL_FORCE or MDS"); - } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) { - // they tried, they failed. - logger().info(" full, replying to FULL_TRY op"); - if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) - return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>( - seastar::now(), - OpsExecuter::osd_op_ierrorator::future<>( - crimson::ct_error::edquot::make())); - else - return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>( - seastar::now(), - OpsExecuter::osd_op_ierrorator::future<>( - crimson::ct_error::enospc::make())); - } else { - // drop request - logger().info(" full, dropping request (bad client)"); - return interruptor::make_ready_future<OpsExecuter::rep_op_fut_tuple>( - seastar::now(), - OpsExecuter::osd_op_ierrorator::future<>( - crimson::ct_error::eagain::make())); - } - } - return std::move(*ox).flush_changes_n_do_ops_effects( - ops, - snap_mapper, - osdriver, - [this] (auto&& txn, - auto&& obc, - auto&& osd_op_p, - auto&& log_entries) { - logger().debug( - "do_osd_ops_execute: object {} submitting txn", - obc->get_oid()); - mutate_object(obc, txn, osd_op_p); - return submit_transaction( - std::move(obc), - std::move(txn), - std::move(osd_op_p), - std::move(log_entries)); - }); - }).safe_then_unpack_interruptible( - [success_func=std::move(success_func), rollbacker, this, failure_func_ptr, obc] - (auto submitted_fut, auto _all_completed_fut) mutable { - - auto all_completed_fut = _all_completed_fut.safe_then_interruptible_tuple( - std::move(success_func), - crimson::ct_error::object_corrupted::handle( - [rollbacker, this, obc] (const std::error_code& e) mutable { - // this is a path for EIO. it's special because we want to fix the obejct - // and try again. that is, the layer above `PG::do_osd_ops` is supposed to - // restart the execution. 
- rollbacker.rollback_obc_if_modified(e); - return repair_object(obc->obs.oi.soid, - obc->obs.oi.version - ).then_interruptible([] { - return do_osd_ops_iertr::future<Ret>{crimson::ct_error::eagain::make()}; - }); - }), OpsExecuter::osd_op_errorator::all_same_way( - [rollbacker, failure_func_ptr] - (const std::error_code& e) mutable { - // handle non-fatal errors only - ceph_assert(e.value() == EDQUOT || - e.value() == ENOSPC || - e.value() == EAGAIN); - rollbacker.rollback_obc_if_modified(e); - return (*failure_func_ptr)(e); - })); - - return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>( - std::move(submitted_fut), - std::move(all_completed_fut) - ); - }, OpsExecuter::osd_op_errorator::all_same_way( - [this, op_info, m, obc, - rollbacker, failure_func_ptr] - (const std::error_code& e) mutable { - ceph_tid_t rep_tid = shard_services.get_tid(); - rollbacker.rollback_obc_if_modified(e); - // record error log - auto maybe_submit_error_log = - interruptor::make_ready_future<std::optional<eversion_t>>(std::nullopt); - // call submit_error_log only for non-internal clients - if constexpr (!std::is_same_v<Ret, void>) { - if(op_info.may_write()) { - maybe_submit_error_log = - submit_error_log(m, op_info, obc, e, rep_tid); - } - } - return maybe_submit_error_log.then_interruptible( - [this, failure_func_ptr, e, rep_tid] (auto version) { - auto all_completed = - [this, failure_func_ptr, e, rep_tid, version] { - if (version.has_value()) { - return complete_error_log(rep_tid, version.value() - ).then_interruptible([failure_func_ptr, e] { - return (*failure_func_ptr)(e); - }); - } else { - return (*failure_func_ptr)(e); - } - }; - return PG::do_osd_ops_iertr::make_ready_future<pg_rep_op_fut_t<Ret>>( - std::move(seastar::now()), - std::move(all_completed()) - ); - }); - })); -} - PG::interruptible_future<> PG::complete_error_log(const ceph_tid_t& rep_tid, const eversion_t& version) { @@ -1146,7 +1007,7 @@ PG::interruptible_future<> PG::complete_error_log(const ceph_tid_t& rep_tid, return result; } -PG::interruptible_future<std::optional<eversion_t>> PG::submit_error_log( +PG::interruptible_future<eversion_t> PG::submit_error_log( Ref<MOSDOp> m, const OpInfo &op_info, ObjectContextRef obc, @@ -1212,142 +1073,84 @@ PG::interruptible_future<std::optional<eversion_t>> PG::submit_error_log( get_collection_ref(), std::move(t) ).then([this] { peering_state.update_trim_to(); - return seastar::make_ready_future<std::optional<eversion_t>>(projected_last_update); + return seastar::make_ready_future<eversion_t>(projected_last_update); }); }); }); } -PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<MURef<MOSDOpReply>>> -PG::do_osd_ops( - Ref<MOSDOp> m, - crimson::net::ConnectionXcoreRef conn, +PG::run_executer_fut PG::run_executer( + OpsExecuter &ox, ObjectContextRef obc, const OpInfo &op_info, - const SnapContext& snapc) + std::vector<OSDOp>& ops) { - if (__builtin_expect(stopping, false)) { - throw crimson::common::system_shutdown_exception(); - } - return do_osd_ops_execute<MURef<MOSDOpReply>>( - seastar::make_lw_shared<OpsExecuter>( - Ref<PG>{this}, obc, op_info, *m, conn, snapc), - obc, - op_info, - m, - m->ops, - // success_func - [this, m, obc, may_write = op_info.may_write(), - may_read = op_info.may_read(), rvec = op_info.allows_returnvec()] { - // TODO: should stop at the first op which returns a negative retval, - // cmpext uses it for returning the index of first unmatched byte - int result = m->ops.empty() ? 
0 : m->ops.back().rval.code; - if (may_read && result >= 0) { - for (auto &osdop : m->ops) { - if (osdop.rval < 0 && !(osdop.op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { - result = osdop.rval.code; - break; - } - } - } else if (result > 0 && may_write && !rvec) { - result = 0; - } else if (result < 0 && (m->ops.empty() ? - 0 : m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK)) { - result = 0; - } - auto reply = crimson::make_message<MOSDOpReply>(m.get(), - result, - get_osdmap_epoch(), - 0, - false); - reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); - logger().debug( - "do_osd_ops: {} - object {} sending reply", - *m, - m->get_hobj()); - if (obc->obs.exists) { - reply->set_reply_versions(peering_state.get_info().last_update, - obc->obs.oi.user_version); - } else { - reply->set_reply_versions(peering_state.get_info().last_update, - peering_state.get_info().last_user_version); - } - return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>( - std::move(reply)); - }, - // failure_func - [m, this] - (const std::error_code& e) { - logger().error("do_osd_ops_execute::failure_func {} got error: {}", - *m, e); - return log_reply(m, e); + LOG_PREFIX(PG::run_executer); + auto rollbacker = ox.create_rollbacker( + [stored_obc=duplicate_obc(obc)](auto &obc) mutable { + obc->update_from(*stored_obc); + }); + auto rollback_on_error = seastar::defer([&rollbacker] { + rollbacker.rollback_obc_if_modified(); }); -} -PG::do_osd_ops_iertr::future<MURef<MOSDOpReply>> -PG::log_reply( - Ref<MOSDOp> m, - const std::error_code& e) -{ - auto reply = crimson::make_message<MOSDOpReply>( - m.get(), -e.value(), get_osdmap_epoch(), 0, false); - if (m->ops.empty() ? 0 : - m->ops.back().op.flags & CEPH_OSD_OP_FLAG_FAILOK) { - reply->set_result(0); - } - // For all ops except for CMPEXT, the correct error value is encoded - // in e.value(). For CMPEXT, osdop.rval has the actual error value. - if (e.value() == ct_error::cmp_fail_error_value) { - assert(!m->ops.empty()); - for (auto &osdop : m->ops) { - if (osdop.rval < 0) { - reply->set_result(osdop.rval); - break; + for (auto &op: ops) { + DEBUGDPP("object {} handle op {}", *this, ox.get_target(), op); + co_await ox.execute_op(op); + } + DEBUGDPP("object {} all operations successful", *this, ox.get_target()); + + // check for full + if ((ox.delta_stats.num_bytes > 0 || + ox.delta_stats.num_objects > 0) && + get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL)) { + const auto& m = ox.get_message(); + if (m.get_reqid().name.is_mds() || // FIXME: ignore MDS for now + m.has_flag(CEPH_OSD_FLAG_FULL_FORCE)) { + INFODPP("full, but proceeding due to FULL_FORCE, or MDS", *this); + } else if (m.has_flag(CEPH_OSD_FLAG_FULL_TRY)) { + // they tried, they failed. 
+ INFODPP("full, replying to FULL_TRY op", *this); + if (get_pgpool().info.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) { + co_await run_executer_fut( + crimson::ct_error::edquot::make()); + } else { + co_await run_executer_fut( + crimson::ct_error::enospc::make()); } + } else { + // drop request + INFODPP("full, dropping request (bad client)", *this); + co_await run_executer_fut( + crimson::ct_error::eagain::make()); } } - reply->set_enoent_reply_versions( - peering_state.get_info().last_update, - peering_state.get_info().last_user_version); - reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK); - return do_osd_ops_iertr::make_ready_future<MURef<MOSDOpReply>>( - std::move(reply)); -} - -PG::do_osd_ops_iertr::future<PG::pg_rep_op_fut_t<>> -PG::do_osd_ops( - ObjectContextRef obc, - std::vector<OSDOp>& ops, - const OpInfo &op_info, - const do_osd_ops_params_t &&msg_params) -{ - // This overload is generally used for internal client requests, - // use an empty SnapContext. - return seastar::do_with( - std::move(msg_params), - [=, this, &ops, &op_info](auto &msg_params) { - return do_osd_ops_execute<void>( - seastar::make_lw_shared<OpsExecuter>( - Ref<PG>{this}, - obc, - op_info, - msg_params, - msg_params.get_connection(), - SnapContext{} - ), - obc, - op_info, - Ref<MOSDOp>(), - ops, - // success_func - [] { - return do_osd_ops_iertr::now(); - }, - // failure_func - [] (const std::error_code& e) { - return do_osd_ops_iertr::now(); - }); - }); + rollback_on_error.cancel(); +} + +PG::submit_executer_fut PG::submit_executer( + OpsExecuter &&ox, + const std::vector<OSDOp>& ops) { + LOG_PREFIX(PG::submit_executer); + // transaction must commit at this point + return std::move( + ox + ).flush_changes_n_do_ops_effects( + ops, + snap_mapper, + osdriver, + [FNAME, this](auto&& txn, + auto&& obc, + auto&& osd_op_p, + auto&& log_entries) { + DEBUGDPP("object {} submitting txn", *this, obc->get_oid()); + mutate_object(obc, txn, osd_op_p); + return submit_transaction( + std::move(obc), + std::move(txn), + std::move(osd_op_p), + std::move(log_entries)); + }); } PG::interruptible_future<MURef<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m) @@ -1885,4 +1688,19 @@ void PG::cancel_pglog_based_recovery_op() { pglog_based_recovery_op->cancel(); reset_pglog_based_recovery_op(); } + +void PG::C_PG_FinishRecovery::finish(int r) { + LOG_PREFIX(PG::C_PG_FinishRecovery::finish); + auto &peering_state = pg.get_peering_state(); + if (peering_state.is_deleting() || !peering_state.is_clean()) { + DEBUGDPP("raced with delete or repair", pg); + return; + } + if (this == pg.recovery_finisher) { + peering_state.purge_strays(); + pg.recovery_finisher = nullptr; + } else { + DEBUGDPP("stale recovery finsher", pg); + } +} } diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 93279a18c56..604f49005ff 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -375,7 +375,7 @@ public: } void check_blocklisted_watchers() final; void clear_primary_state() final { - // Not needed yet + recovery_finisher = nullptr; } void queue_check_readable(epoch_t last_peering_reset, @@ -394,7 +394,7 @@ public: void on_replica_activate() final; void on_activate_complete() final; void on_new_interval() final { - // Not needed yet + recovery_finisher = nullptr; } Context *on_clean() final; void on_activate_committed() final { @@ -621,7 +621,7 @@ public: void dump_primary(Formatter*); interruptible_future<> complete_error_log(const ceph_tid_t& rep_tid, const eversion_t& version); - interruptible_future<std::optional<eversion_t>> 
submit_error_log( + interruptible_future<eversion_t> submit_error_log( Ref<MOSDOp> m, const OpInfo &op_info, ObjectContextRef obc, @@ -645,41 +645,35 @@ private: } } background_process_lock; - using do_osd_ops_ertr = crimson::errorator< - crimson::ct_error::eagain>; - using do_osd_ops_iertr = - ::crimson::interruptible::interruptible_errorator< - ::crimson::osd::IOInterruptCondition, - ::crimson::errorator<crimson::ct_error::eagain>>; - template <typename Ret = void> - using pg_rep_op_fut_t = - std::tuple<interruptible_future<>, - do_osd_ops_iertr::future<Ret>>; - do_osd_ops_iertr::future<pg_rep_op_fut_t<MURef<MOSDOpReply>>> do_osd_ops( - Ref<MOSDOp> m, - crimson::net::ConnectionXcoreRef conn, + using run_executer_ertr = crimson::compound_errorator_t< + OpsExecuter::osd_op_errorator, + crimson::errorator< + crimson::ct_error::edquot, + crimson::ct_error::eagain, + crimson::ct_error::enospc + > + >; + using run_executer_iertr = crimson::interruptible::interruptible_errorator< + ::crimson::osd::IOInterruptCondition, + run_executer_ertr>; + using run_executer_fut = run_executer_iertr::future<>; + run_executer_fut run_executer( + OpsExecuter &ox, ObjectContextRef obc, const OpInfo &op_info, - const SnapContext& snapc); + std::vector<OSDOp>& ops); + + using submit_executer_ret = std::tuple< + interruptible_future<>, + interruptible_future<>>; + using submit_executer_fut = interruptible_future< + submit_executer_ret>; + submit_executer_fut submit_executer( + OpsExecuter &&ox, + const std::vector<OSDOp>& ops); struct do_osd_ops_params_t; - do_osd_ops_iertr::future<MURef<MOSDOpReply>> log_reply( - Ref<MOSDOp> m, - const std::error_code& e); - do_osd_ops_iertr::future<pg_rep_op_fut_t<>> do_osd_ops( - ObjectContextRef obc, - std::vector<OSDOp>& ops, - const OpInfo &op_info, - const do_osd_ops_params_t &¶ms); - template <class Ret, class SuccessFunc, class FailureFunc> - do_osd_ops_iertr::future<pg_rep_op_fut_t<Ret>> do_osd_ops_execute( - seastar::lw_shared_ptr<OpsExecuter> ox, - ObjectContextRef obc, - const OpInfo &op_info, - Ref<MOSDOp> m, - std::vector<OSDOp>& ops, - SuccessFunc&& success_func, - FailureFunc&& failure_func); + interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m); interruptible_future< std::tuple<interruptible_future<>, interruptible_future<>>> @@ -712,9 +706,17 @@ public: } seastar::future<> stop(); private: + class C_PG_FinishRecovery : public Context { + public: + explicit C_PG_FinishRecovery(PG &pg) : pg(pg) {} + void finish(int r) override; + private: + PG& pg; + }; std::unique_ptr<PGBackend> backend; std::unique_ptr<RecoveryBackend> recovery_backend; std::unique_ptr<PGRecovery> recovery_handler; + C_PG_FinishRecovery *recovery_finisher; PeeringState peering_state; eversion_t projected_last_update; diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc index fa8201b61c2..24a381b4cf7 100644 --- a/src/crimson/osd/pg_backend.cc +++ b/src/crimson/osd/pg_backend.cc @@ -1289,7 +1289,7 @@ void PGBackend::clone( const ObjectState& d_os, ceph::os::Transaction& txn) { - // See OpsExecutor::execute_clone documentation + // See OpsExecuter::execute_clone documentation txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid}); { ceph::bufferlist bv; diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index 4f874d526b3..ec3af0d2b00 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -528,10 +528,12 @@ void PGRecovery::request_primary_scan( void PGRecovery::enqueue_push( const 
hobject_t& obj, - const eversion_t& v) + const eversion_t& v, + const std::vector<pg_shard_t> &peers) { - logger().info("{}: obj={} v={}", - __func__, obj, v); + logger().info("{}: obj={} v={} peers={}", __func__, obj, v, peers); + auto &peering_state = pg->get_peering_state(); + peering_state.prepare_backfill_for_missing(obj, v, peers); auto [recovering, added] = pg->get_recovery_backend()->add_recovering(obj); if (!added) return; diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index 6cd29c3dc52..705b3176b97 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -110,7 +110,8 @@ private: const hobject_t& begin) final; void enqueue_push( const hobject_t& obj, - const eversion_t& v) final; + const eversion_t& v, + const std::vector<pg_shard_t> &peers) final; void enqueue_drop( const pg_shard_t& target, const hobject_t& obj, diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index 5f7c4a62447..a053d9d5044 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -767,20 +767,26 @@ seastar::future<> ShardServices::dispatch_context_transaction( LOG_PREFIX(OSDSingletonState::dispatch_context_transaction); if (ctx.transaction.empty()) { DEBUG("empty transaction"); - return seastar::now(); + co_await get_store().flush(col); + Context* on_commit( + ceph::os::Transaction::collect_all_contexts(ctx.transaction)); + if (on_commit) { + on_commit->complete(0); + } + co_return; } DEBUG("do_transaction ..."); - auto ret = get_store().do_transaction( + co_await get_store().do_transaction( col, ctx.transaction.claim_and_reset()); - return ret; + co_return; } seastar::future<> ShardServices::dispatch_context_messages( BufferedRecoveryMessages &&ctx) { - LOG_PREFIX(OSDSingletonState::dispatch_context_transaction); + LOG_PREFIX(OSDSingletonState::dispatch_context_messages); auto ret = seastar::parallel_for_each(std::move(ctx.message_map), [FNAME, this](auto& osd_messages) { auto& [peer, messages] = osd_messages; diff --git a/src/crypto/isa-l/CMakeLists.txt b/src/crypto/isa-l/CMakeLists.txt index 40da7e495c3..af8f7e185c8 100644 --- a/src/crypto/isa-l/CMakeLists.txt +++ b/src/crypto/isa-l/CMakeLists.txt @@ -1,36 +1,17 @@ -set(isal_dir ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto) -set(CMAKE_ASM_FLAGS "-i ${isal_dir}/aes/ -i ${isal_dir}/include/ ${CMAKE_ASM_FLAGS}") +# build isa-l_crypto from its makefile and expose as target ISAL::Crypto +include(BuildISALCrypto) +build_isal_crypto() set(isal_crypto_plugin_srcs isal_crypto_accel.cc - isal_crypto_plugin.cc - ${isal_dir}/aes/cbc_pre.c - ${isal_dir}/aes/cbc_multibinary.asm - ${isal_dir}/aes/keyexp_128.asm - ${isal_dir}/aes/keyexp_192.asm - ${isal_dir}/aes/keyexp_256.asm - ${isal_dir}/aes/keyexp_multibinary.asm - ${isal_dir}/aes/cbc_dec_128_x4_sse.asm - ${isal_dir}/aes/cbc_dec_128_x8_avx.asm - ${isal_dir}/aes/cbc_dec_192_x4_sse.asm - ${isal_dir}/aes/cbc_dec_192_x8_avx.asm - ${isal_dir}/aes/cbc_dec_256_x4_sse.asm - ${isal_dir}/aes/cbc_dec_256_x8_avx.asm - ${isal_dir}/aes/cbc_enc_128_x4_sb.asm - ${isal_dir}/aes/cbc_enc_128_x8_sb.asm - ${isal_dir}/aes/cbc_enc_192_x4_sb.asm - ${isal_dir}/aes/cbc_enc_192_x8_sb.asm - ${isal_dir}/aes/cbc_enc_256_x4_sb.asm - ${isal_dir}/aes/cbc_enc_256_x8_sb.asm) + isal_crypto_plugin.cc) if(HAVE_NASM_X64) add_dependencies(crypto_plugins ceph_crypto_isal) endif(HAVE_NASM_X64) add_library(ceph_crypto_isal SHARED ${isal_crypto_plugin_srcs}) -target_include_directories(ceph_crypto_isal PRIVATE ${isal_dir}/include) - 
-target_link_libraries(ceph_crypto_isal PRIVATE Boost::context) +target_link_libraries(ceph_crypto_isal PRIVATE ISAL::Crypto Boost::context) set_target_properties(ceph_crypto_isal PROPERTIES VERSION 1.0.0 diff --git a/src/doc/rgw/cloud-restore.md b/src/doc/rgw/cloud-restore.md new file mode 100644 index 00000000000..d54b18dfa50 --- /dev/null +++ b/src/doc/rgw/cloud-restore.md @@ -0,0 +1,127 @@ +# cloud-restore + +## Introduction + +[`cloud-transition`](https://docs.ceph.com/en/latest/radosgw/cloud-transition) feature enables data transition to a remote cloud service as part of Lifecycle Configuration via Storage Classes. However the transition is unidirectional; data cannot be transitioned back from the remote zone. + +The `cloud-restore` feature enables restoration of those transitioned objects from the remote cloud S3 endpoints back into RGW. + +The objects can be restored either by using S3 `restore-object` CLI or via `read-through`. The restored copies can be either temporary or permanent. + +## S3 restore-object CLI + +The goal here is to implement minimal functionality of [`S3RestoreObject`](https://docs.aws.amazon.com/cli/latest/reference/s3api/restore-object.html) API so that users can restore the cloud transitioned objects. + +```sh +aws s3api restore-object \ + --bucket <value> \ + --key <value> ( can be object name or * for Bulk restore) \ + [--version-id <value>] \ + --restore-request (structure) { + // for temporary restore + { "Days": integer, } + // if Days not provided, it will be considered as permanent copy + } +``` + +This CLI may be extended in future to include custom parameters (like target-bucket/storage-class etc) specific to RGW. + +## read-through + +As per the cloud-transition feature functionality, the cloud-transitioned objects cannot be read. `GET` on those objects fails with ‘InvalidObjectState’ error. + +But using this restore feature, transitioned objects can be restored and read. New tier-config options `allow_read_through` and `read_through_restore_days` are added for the same. Only when `allow_read_through` is enabled, `GET` on the transitioned objects will restore the objects from the S3 endpoint. + +Note: The object copy restored via `readthrough` is temporary and is retained only for the duration of `read_through_restore_days`. + +## Design + +* Similar to cloud-transition feature, this feature currently works for **only s3 compatible cloud endpoint**. +* This feature works for only **cloud-transitioned objects**. In order to validate this, `retain_head_object` option should be set to true so that the object’s `HEAD` object can be verified before restoring the object. + +* **Request flow:** + * Once the `HEAD` object is verified, its cloudtier storage class config details are fetched. +Note: Incase the cloudtier storage-class is deleted/updated, the object may not be restored. + * RestoreStatus for the `HEAD` object is marked `RestoreAlreadyInProgress` + * Object Restore is done asynchronously by issuing either S3 `GET` or S3 `RESTORE` request to the remote endpoint. + * Once the object is restored, RestoreStaus is updated as `CloudRestored` and RestoreType is set to either `Temporary` or `Permanent`. + * Incase the operation fails, RestoreStatus is marked as `RestoreFailed`. 
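To make the request flow above concrete, the following is an editor's sketch of the client-side sequence using the stock AWS CLI against an RGW endpoint; the endpoint, bucket and key names are placeholders, and the response codes and `InvalidObjectState` behaviour are the ones described in this design.

```sh
# Ask RGW to restore a cloud-transitioned object as a temporary (7-day) copy.
# Per this design, the first request returns 202 Accepted, a repeat on an
# already-restored object returns 200 OK, and RestoreAlreadyInProgress is
# returned while a restore is still running.
aws --endpoint-url http://rgw.example.com:8000 s3api restore-object \
    --bucket mybucket --key myobject \
    --restore-request '{"Days": 7}'

# Poll progress with head-object; a GET keeps failing with InvalidObjectState
# until the restore completes, after which it returns the object data.
aws --endpoint-url http://rgw.example.com:8000 s3api head-object \
    --bucket mybucket --key myobject
```

Omitting `Days` from the restore request yields a permanent copy, as noted in the CLI section above.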
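The read-through variant needs no client call beyond a plain GET; it is switched on through the cloud-tier storage class configuration. A minimal sketch, assuming the usual `radosgw-admin zonegroup placement modify` tier-config syntax used for cloud-transition and placeholder zonegroup, placement and storage-class names; the two option names are the ones introduced by this design:

```sh
# Enable read-through restore on an existing cloud-s3 tier storage class.
# Objects restored this way are temporary copies retained for
# read_through_restore_days (placeholder value shown).
radosgw-admin zonegroup placement modify \
    --rgw-zonegroup default \
    --placement-id default-placement \
    --storage-class CLOUDTIER \
    --tier-config=allow_read_through=true,read_through_restore_days=3
```

With this set, a GET on a transitioned object in that storage class triggers a restore from the remote S3 endpoint and then serves the data, and the temporary copy is reclaimed once the configured number of days has elapsed.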
+ +* **New attrs:** Below are the new attrs being added + * `user.rgw.restore-status`: <Restore operation Status> + * `user.rgw.restore-type`: <Type of Restore> + * `user.rgw.restored-at`: <Restoration Time> + * `user.rgw.restore-expiry-date`: <Expiration time incase of temporary copies> + * `user.rgw.cloudtier_storage_class`: <CloudTier storage class used in case of temporarily restored copies> + +```cpp + enum RGWRestoreStatus : uint8_t { + None = 0, + RestoreAlreadyInProgress = 1, + CloudRestored = 2, + RestoreFailed = 3 + }; + enum class RGWRestoreType : uint8_t { + None = 0, + Temporary = 1, + Permanent = 2 + }; +``` + +* **Response:** +* `S3 restore-object CLI` returns SUCCESS - either the 200 OK or 202 Accepted status code. + * If the object is not previously restored, then RGW returns 202 Accepted in the response. + * If the object is previously restored, RGW returns 200 OK in the response. + * Special errors: + Code: RestoreAlreadyInProgress ( Cause: Object restore is already in progress.) + Code: ObjectNotFound (if Object is not found in cloud endpoint) + Code: I/O error (for any other I/O errors during restore) +* `GET request` continues to return an ‘InvalidObjectState’ error till the object is successfully restored. + * S3 head-object can be used to verify if the restore is still in progress. + * Once the object is restored, GET will return the object data. + +* **StorageClass**: By default, the objects are restored to `STANDARD` storage class. However, as per [AWS S3 Restore](https://docs.aws.amazon.com/cli/latest/reference/s3api/restore-object.html) the storage-class remains the same for restored objects. Hence for the temporary copies, the `x-amz-storage-class` returned contains original cloudtier storage-class. + * Note: A new tier-config option may be added to select the storage-class to restore the objects to. + +* **mtime**: If the restored object is temporary, object is still marked `RGWObj::CloudTiered` and mtime is not changed i.e, still set to transition time. But in case the object is permanent copy, it is marked `RGWObj::Main` and mtime is updated to the restore time (now()). + +* **Lifecycle**: + * `Temporary` copies are not subjected to any further transition to the cloud. However (as is the case with cloud-transitioned objects) they can be deleted via regular LC expiration rules or via external S3 Delete request. + * `Permanent` copies are treated as any regular objects and are subjected to any LC rules applicable. + +* **Replication**: The restored objects (both temporary and permanent) are also replicated like regular objects and will be deleted across the zones post expiration. + +* **VersionedObjects** : In case of versioning, if any object is cloud-transitioned, it would have been non-current. Post restore too, the same non-current object will be updated with the downloaded data and its HEAD object will be updated accordingly as the case with regular objects. + +* **Temporary Object Expiry**: This is done via Object Expirer + * When the object is restored as temporary, `user.rgw.expiry-date` is set accordingly and `delete_at` attr is also updated with the same value. + * This object is then added to the list used by `ObjectExpirer`. 
+ * `LC` worker thread is used to scan through that list and post expiry, resets the objects back to cloud-transitioned state i.e, + * HEAD object with size=0 + * new attrs removed + * `delete_at` reset + * Note: A new RGW option `rgw_restore_debug_interval` is added, which when set will be considered as `Days` value (similar to `rgw_lc_debug_interval`). + +* **FAILED Restore**: In case the restore operation fails, + * The HEAD object will be updated accordingly.. i.e, Storage-class is reset to the original cloud-tier storage class + * All the new attrs added will be removed , except for `user.rgw.restore-status` which will be updated as `RestoreFailed` + +* **Check Restore Progress**: Users can issue S3 `head-object` request to check if the restore is done or still in progress for any object. + +* **RGW down/restarts** - Since the restore operation is asynchronous, we need to keep track of the objects being restored. In case RGW is down/restarts, this data will be used to retrigger on-going restore requests or do appropriate cleanup for the failed requests. + +* **Compression** - If the placement-target to which the objects are being restored to has compression enabled, the data will be compressed accordingly (bug2294512) + +* **Encryption** - If the restored object is encrypted, the old sse-related xattrs/keys from the HEAD stub will be copied back into object metadata (bug2294512) + +* **Delete cloud object post restore** - Once the object is successfully restored, the object at the remote endpoint is still retained. However we could choose to delete it for permanent restored copies by adding new tier-config option. + +## Future work + +* **Bulk Restore**: In the case of BulkRestore, some of the objects may not be restored. User needs to manually cross-check the objects to check the objects restored or InProgress. + +* **Admin CLIs**: Admin debug commands will be provided to start, check the status and cancel the restore operations. 
+ +* **Admin Ops** + +* **Restore Notifications** diff --git a/src/erasure-code/isa/CMakeLists.txt b/src/erasure-code/isa/CMakeLists.txt index 2ca398ffcb1..6162075cbc8 100644 --- a/src/erasure-code/isa/CMakeLists.txt +++ b/src/erasure-code/isa/CMakeLists.txt @@ -1,113 +1,18 @@ -# ISA -set(isal_src_dir ${CMAKE_SOURCE_DIR}/src/isa-l) -include_directories(${isal_src_dir}/include) +# build isa-l from its makefile and expose as target ISAL::ISAL +include(BuildISAL) +build_isal() -if(HAVE_NASM_X64_AVX2) - set(CMAKE_ASM_FLAGS "-i ${isal_src_dir}/include/ ${CMAKE_ASM_FLAGS}") - set(isa_srcs - ${isal_src_dir}/erasure_code/ec_base.c - ${isal_src_dir}/erasure_code/gf_2vect_dot_prod_sse.asm - ${isal_src_dir}/erasure_code/gf_3vect_dot_prod_sse.asm - ${isal_src_dir}/erasure_code/gf_4vect_dot_prod_sse.asm - ${isal_src_dir}/erasure_code/gf_5vect_dot_prod_sse.asm - ${isal_src_dir}/erasure_code/gf_6vect_dot_prod_sse.asm - ${isal_src_dir}/erasure_code/gf_vect_dot_prod_sse.asm - ${isal_src_dir}/erasure_code/gf_2vect_mad_avx2.asm - ${isal_src_dir}/erasure_code/gf_3vect_mad_avx2.asm - ${isal_src_dir}/erasure_code/gf_4vect_mad_avx2.asm - ${isal_src_dir}/erasure_code/gf_5vect_mad_avx2.asm - ${isal_src_dir}/erasure_code/gf_6vect_mad_avx2.asm - ${isal_src_dir}/erasure_code/gf_vect_mad_avx2.asm - ${isal_src_dir}/erasure_code/ec_highlevel_func.c - ${isal_src_dir}/erasure_code/gf_2vect_mad_avx.asm - ${isal_src_dir}/erasure_code/gf_3vect_mad_avx.asm - ${isal_src_dir}/erasure_code/gf_4vect_mad_avx.asm - ${isal_src_dir}/erasure_code/gf_5vect_mad_avx.asm - ${isal_src_dir}/erasure_code/gf_6vect_mad_avx.asm - ${isal_src_dir}/erasure_code/gf_vect_mad_avx.asm - ${isal_src_dir}/erasure_code/ec_multibinary.asm - ${isal_src_dir}/erasure_code/gf_2vect_mad_sse.asm - ${isal_src_dir}/erasure_code/gf_3vect_mad_sse.asm - ${isal_src_dir}/erasure_code/gf_4vect_mad_sse.asm - ${isal_src_dir}/erasure_code/gf_5vect_mad_sse.asm - ${isal_src_dir}/erasure_code/gf_6vect_mad_sse.asm - ${isal_src_dir}/erasure_code/gf_vect_mad_sse.asm - ${isal_src_dir}/erasure_code/gf_2vect_dot_prod_avx2.asm - ${isal_src_dir}/erasure_code/gf_3vect_dot_prod_avx2.asm - ${isal_src_dir}/erasure_code/gf_4vect_dot_prod_avx2.asm - ${isal_src_dir}/erasure_code/gf_5vect_dot_prod_avx2.asm - ${isal_src_dir}/erasure_code/gf_6vect_dot_prod_avx2.asm - ${isal_src_dir}/erasure_code/gf_vect_dot_prod_avx2.asm - ${isal_src_dir}/erasure_code/gf_vect_mul_avx.asm - ${isal_src_dir}/erasure_code/gf_2vect_dot_prod_avx.asm - ${isal_src_dir}/erasure_code/gf_3vect_dot_prod_avx.asm - ${isal_src_dir}/erasure_code/gf_4vect_dot_prod_avx.asm - ${isal_src_dir}/erasure_code/gf_5vect_dot_prod_avx.asm - ${isal_src_dir}/erasure_code/gf_6vect_dot_prod_avx.asm - ${isal_src_dir}/erasure_code/gf_vect_dot_prod_avx.asm - ${isal_src_dir}/erasure_code/gf_vect_mul_sse.asm - ${isal_src_dir}/erasure_code/gf_2vect_dot_prod_avx512.asm - ${isal_src_dir}/erasure_code/gf_2vect_mad_avx512.asm - ${isal_src_dir}/erasure_code/gf_3vect_dot_prod_avx512.asm - ${isal_src_dir}/erasure_code/gf_3vect_mad_avx512.asm - ${isal_src_dir}/erasure_code/gf_4vect_dot_prod_avx512.asm - ${isal_src_dir}/erasure_code/gf_4vect_mad_avx512.asm - ${isal_src_dir}/erasure_code/gf_vect_dot_prod_avx512.asm - ${isal_src_dir}/erasure_code/gf_vect_mad_avx512.asm - ${isal_src_dir}/raid/raid_base.c - ${isal_src_dir}/raid/raid_multibinary.asm - ${isal_src_dir}/raid/xor_check_sse.asm - ${isal_src_dir}/raid/xor_gen_sse.asm - ${isal_src_dir}/raid/xor_gen_avx.asm - ${isal_src_dir}/raid/xor_gen_avx512.asm - ${isal_src_dir}/raid/pq_check_sse.asm - 
${isal_src_dir}/raid/pq_gen_sse.asm - ${isal_src_dir}/raid/pq_gen_avx.asm - ${isal_src_dir}/raid/pq_gen_avx2.asm - ErasureCodeIsa.cc - ErasureCodeIsaTableCache.cc - ErasureCodePluginIsa.cc - ) -elseif(HAVE_ARMV8_SIMD) - set(isa_srcs - ${isal_src_dir}/erasure_code/ec_base.c - ${isal_src_dir}/erasure_code/aarch64/ec_aarch64_highlevel_func.c - ${isal_src_dir}/erasure_code/aarch64/ec_aarch64_dispatcher.c - ${isal_src_dir}/erasure_code/aarch64/gf_2vect_dot_prod_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_2vect_mad_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_3vect_dot_prod_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_3vect_mad_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_4vect_dot_prod_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_4vect_mad_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_5vect_dot_prod_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_5vect_mad_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_6vect_mad_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_vect_dot_prod_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_vect_mad_neon.S - ${isal_src_dir}/erasure_code/aarch64/gf_vect_mul_neon.S - ${isal_src_dir}/erasure_code/aarch64/ec_multibinary_arm.S - ${isal_src_dir}/raid/raid_base.c - ${isal_src_dir}/raid/aarch64/raid_aarch64_dispatcher.c - ${isal_src_dir}/raid/aarch64/raid_multibinary_arm.S - ${isal_src_dir}/raid/aarch64/xor_check_neon.S - ${isal_src_dir}/raid/aarch64/xor_gen_neon.S - ${isal_src_dir}/raid/aarch64/pq_check_neon.S - ${isal_src_dir}/raid/aarch64/pq_gen_neon.S - ErasureCodeIsa.cc - ErasureCodeIsaTableCache.cc - ErasureCodePluginIsa.cc - ) - set_source_files_properties( - ${isal_src_dir}/erasure_code/aarch64/ec_multibinary_arm.S - ${isal_src_dir}/raid/aarch64/raid_multibinary_arm.S - PROPERTIES COMPILE_FLAGS "-D__ASSEMBLY__" - ) -endif() +# ISA +set(isa_srcs + ErasureCodeIsa.cc + ErasureCodeIsaTableCache.cc + ErasureCodePluginIsa.cc +) add_library(ec_isa SHARED ${isa_srcs} $<TARGET_OBJECTS:erasure_code_objs>) -target_link_libraries(ec_isa ${EXTRALIBS}) +target_link_libraries(ec_isa ISAL::ISAL ${EXTRALIBS}) set_target_properties(ec_isa PROPERTIES INSTALL_RPATH "") install(TARGETS ec_isa DESTINATION ${erasure_plugin_dir}) diff --git a/src/exporter/DaemonMetricCollector.cc b/src/exporter/DaemonMetricCollector.cc index d4930ea35c0..4b8a8131bcf 100644 --- a/src/exporter/DaemonMetricCollector.cc +++ b/src/exporter/DaemonMetricCollector.cc @@ -168,10 +168,17 @@ void DaemonMetricCollector::dump_asok_metrics(bool sort_metrics, int64_t counter if (sockClientsPing) { bool ok; sock_client.ping(&ok); + std::string ceph_daemon_socket_up_desc( + "Reports the health status of a Ceph daemon, as determined by whether it is able to respond via its admin socket (1 = healthy, 0 = unhealthy)."); + labels_t ceph_daemon_socket_up_labels; + ceph_daemon_socket_up_labels["hostname"] = quote(ceph_get_hostname()); + ceph_daemon_socket_up_labels["ceph_daemon"] = quote(daemon_name); + add_metric(builder, static_cast<int>(ok), "ceph_daemon_socket_up", ceph_daemon_socket_up_desc, + "gauge", ceph_daemon_socket_up_labels); if (!ok) { failures++; continue; - } + } } std::string counter_dump_response = dump_response.size() > 0 ? 
dump_response : asok_request(sock_client, "counter dump", daemon_name); diff --git a/src/exporter/DaemonMetricCollector.h b/src/exporter/DaemonMetricCollector.h index d2e929b4d67..3302e95df91 100644 --- a/src/exporter/DaemonMetricCollector.h +++ b/src/exporter/DaemonMetricCollector.h @@ -42,11 +42,11 @@ public: std::map<std::string, AdminSocketClient> clients; std::string metrics; std::pair<labels_t, std::string> add_fixed_name_metrics(std::string metric_name); + void update_sockets(); private: std::mutex metrics_mutex; std::unique_ptr<MetricsBuilder> builder; - void update_sockets(); void request_loop(boost::asio::steady_timer &timer); void dump_asok_metric(boost::json::object perf_info, diff --git a/src/global/global_init.cc b/src/global/global_init.cc index 57ee5ee7167..79defaec376 100644 --- a/src/global/global_init.cc +++ b/src/global/global_init.cc @@ -13,6 +13,7 @@ */ #include <filesystem> +#include <memory> #include "common/async/context_pool.h" #include "common/ceph_argparse.h" #include "common/code_environment.h" @@ -268,10 +269,14 @@ global_init(const std::map<std::string,std::string> *defaults, if (g_conf()->setgroup.length() > 0) { gid = atoi(g_conf()->setgroup.c_str()); if (!gid) { - char buf[4096]; + // There's no actual well-defined max that I could find in + // library documentation. If we're allocating on the heap, + // 64KiB seems at least reasonable. + static constexpr std::size_t size = 64 * 1024; + auto buf = std::make_unique_for_overwrite<char[]>(size); struct group gr; struct group *g = 0; - getgrnam_r(g_conf()->setgroup.c_str(), &gr, buf, sizeof(buf), &g); + getgrnam_r(g_conf()->setgroup.c_str(), &gr, buf.get(), size, &g); if (!g) { cerr << "unable to look up group '" << g_conf()->setgroup << "'" << ": " << cpp_strerror(errno) << std::endl; diff --git a/src/include/cephfs/metrics/Types.h b/src/include/cephfs/metrics/Types.h index d7cf5613861..af377db606e 100644 --- a/src/include/cephfs/metrics/Types.h +++ b/src/include/cephfs/metrics/Types.h @@ -688,6 +688,10 @@ public: apply_visitor(DumpPayloadVisitor(f), payload); } + static void generate_test_instances(std::list<ClientMetricMessage*>& ls) { + ls.push_back(new ClientMetricMessage(CapInfoPayload(1, 2, 3))); + } + void print(std::ostream *out) const { apply_visitor(PrintPayloadVisitor(out), payload); } diff --git a/src/include/cephfs/types.h b/src/include/cephfs/types.h index 73efc73ae9a..435bc104d83 100644 --- a/src/include/cephfs/types.h +++ b/src/include/cephfs/types.h @@ -226,7 +226,6 @@ struct vinodeno_t { ls.push_back(new vinodeno_t); ls.push_back(new vinodeno_t(1, 2)); } - inodeno_t ino; snapid_t snapid; }; @@ -371,7 +370,6 @@ public: void decode(ceph::buffer::list::const_iterator& bl); void dump(ceph::Formatter *f) const; static void generate_test_instances(std::list<inline_data_t*>& ls); - version_t version = 1; private: diff --git a/src/librados/librados_asio.h b/src/librados/librados_asio.h index 19a8c8fc01d..0aedc376575 100644 --- a/src/librados/librados_asio.h +++ b/src/librados/librados_asio.h @@ -16,6 +16,7 @@ #include "include/rados/librados.hpp" #include "common/async/completion.h" +#include "librados/AioCompletionImpl.h" /// Defines asynchronous librados operations that satisfy all of the /// "Requirements on asynchronous operations" imposed by the C++ Networking TS @@ -53,20 +54,20 @@ using unique_aio_completion_ptr = /// argument to the handler. 
template <typename Result> struct Invoker { - using Signature = void(boost::system::error_code, Result); + using Signature = void(boost::system::error_code, version_t, Result); Result result; template <typename Completion> - void dispatch(Completion&& completion, boost::system::error_code ec) { - ceph::async::dispatch(std::move(completion), ec, std::move(result)); + void dispatch(Completion&& completion, boost::system::error_code ec, version_t ver) { + ceph::async::dispatch(std::move(completion), ec, ver, std::move(result)); } }; // specialization for Result=void template <> struct Invoker<void> { - using Signature = void(boost::system::error_code); + using Signature = void(boost::system::error_code, version_t); template <typename Completion> - void dispatch(Completion&& completion, boost::system::error_code ec) { - ceph::async::dispatch(std::move(completion), ec); + void dispatch(Completion&& completion, boost::system::error_code ec, version_t ver) { + ceph::async::dispatch(std::move(completion), ec, ver); } }; @@ -82,12 +83,15 @@ struct AsyncOp : Invoker<Result> { auto p = std::unique_ptr<Completion>{static_cast<Completion*>(arg)}; // move result out of Completion memory being freed auto op = std::move(p->user_data); - const int ret = op.aio_completion->get_return_value(); + // access AioCompletionImpl directly to avoid locking + const librados::AioCompletionImpl* pc = op.aio_completion->pc; + const int ret = pc->rval; + const version_t ver = pc->objver; boost::system::error_code ec; if (ret < 0) { ec.assign(-ret, librados::detail::err_category()); } - op.dispatch(std::move(p), ec); + op.dispatch(std::move(p), ec, ver); } template <typename Executor1, typename CompletionHandler> @@ -103,7 +107,7 @@ struct AsyncOp : Invoker<Result> { /// Calls IoCtx::aio_read() and arranges for the AioCompletion to call a -/// given handler with signature (boost::system::error_code, bufferlist). +/// given handler with signature (error_code, version_t, bufferlist). template <typename ExecutionContext, typename CompletionToken> auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid, size_t len, uint64_t off, CompletionToken&& token) @@ -119,7 +123,7 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid, int ret = io.aio_read(oid, op.aio_completion.get(), &op.result, len, off); if (ret < 0) { auto ec = boost::system::error_code{-ret, librados::detail::err_category()}; - ceph::async::post(std::move(p), ec, bufferlist{}); + ceph::async::post(std::move(p), ec, 0, bufferlist{}); } else { p.release(); // release ownership until completion } @@ -127,24 +131,24 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid, } /// Calls IoCtx::aio_write() and arranges for the AioCompletion to call a -/// given handler with signature (boost::system::error_code). +/// given handler with signature (error_code, version_t). 
template <typename ExecutionContext, typename CompletionToken> auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid, - bufferlist &bl, size_t len, uint64_t off, + const bufferlist &bl, size_t len, uint64_t off, CompletionToken&& token) { using Op = detail::AsyncOp<void>; using Signature = typename Op::Signature; return boost::asio::async_initiate<CompletionToken, Signature>( [] (auto handler, auto ex, IoCtx& io, const std::string& oid, - bufferlist &bl, size_t len, uint64_t off) { + const bufferlist &bl, size_t len, uint64_t off) { auto p = Op::create(ex, std::move(handler)); auto& op = p->user_data; int ret = io.aio_write(oid, op.aio_completion.get(), bl, len, off); if (ret < 0) { auto ec = boost::system::error_code{-ret, librados::detail::err_category()}; - ceph::async::post(std::move(p), ec); + ceph::async::post(std::move(p), ec, 0); } else { p.release(); // release ownership until completion } @@ -152,7 +156,7 @@ auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid, } /// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a -/// given handler with signature (boost::system::error_code, bufferlist). +/// given handler with signature (error_code, version_t, bufferlist). template <typename ExecutionContext, typename CompletionToken> auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, ObjectReadOperation *read_op, int flags, @@ -170,7 +174,7 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, flags, &op.result); if (ret < 0) { auto ec = boost::system::error_code{-ret, librados::detail::err_category()}; - ceph::async::post(std::move(p), ec, bufferlist{}); + ceph::async::post(std::move(p), ec, 0, bufferlist{}); } else { p.release(); // release ownership until completion } @@ -178,7 +182,7 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, } /// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a -/// given handler with signature (boost::system::error_code). +/// given handler with signature (error_code, version_t). template <typename ExecutionContext, typename CompletionToken> auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, ObjectWriteOperation *write_op, int flags, @@ -196,7 +200,7 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, int ret = io.aio_operate(oid, op.aio_completion.get(), write_op, flags, trace_ctx); if (ret < 0) { auto ec = boost::system::error_code{-ret, librados::detail::err_category()}; - ceph::async::post(std::move(p), ec); + ceph::async::post(std::move(p), ec, 0); } else { p.release(); // release ownership until completion } @@ -204,7 +208,7 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, } /// Calls IoCtx::aio_notify() and arranges for the AioCompletion to call a -/// given handler with signature (boost::system::error_code, bufferlist). +/// given handler with signature (error_code, version_t, bufferlist). 
template <typename ExecutionContext, typename CompletionToken> auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid, bufferlist& bl, uint64_t timeout_ms, CompletionToken &&token) @@ -221,7 +225,7 @@ auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid, bl, timeout_ms, &op.result); if (ret < 0) { auto ec = boost::system::error_code{-ret, librados::detail::err_category()}; - ceph::async::post(std::move(p), ec, bufferlist{}); + ceph::async::post(std::move(p), ec, 0, bufferlist{}); } else { p.release(); // release ownership until completion } diff --git a/src/librbd/crypto/LoadRequest.cc b/src/librbd/crypto/LoadRequest.cc index 5bc57d693c5..66beed59130 100644 --- a/src/librbd/crypto/LoadRequest.cc +++ b/src/librbd/crypto/LoadRequest.cc @@ -31,7 +31,7 @@ LoadRequest<I>::LoadRequest( Context* on_finish) : m_image_ctx(image_ctx), m_on_finish(on_finish), m_format_idx(0), - m_is_current_format_cloned(false), + m_is_current_format_assumed(false), m_formats(std::move(formats)) { } @@ -108,7 +108,7 @@ void LoadRequest<I>::handle_load(int r) { ldout(m_image_ctx->cct, 20) << "r=" << r << dendl; if (r < 0) { - if (m_is_current_format_cloned && + if (m_is_current_format_assumed && m_detected_format_name == UNKNOWN_FORMAT) { // encryption format was not detected, assume plaintext ldout(m_image_ctx->cct, 5) << "assuming plaintext for image " @@ -125,19 +125,29 @@ void LoadRequest<I>::handle_load(int r) { } ldout(m_image_ctx->cct, 5) << "loaded format " << m_detected_format_name - << (m_is_current_format_cloned ? " (cloned)" : "") + << (m_is_current_format_assumed ? " (assumed)" : "") << " for image " << m_current_image_ctx->name << dendl; m_format_idx++; + if (!m_current_image_ctx->migration_info.empty()) { + // prepend the format to use for the migration source image + // it's done implicitly here because this image is moved to the + // trash when migration is prepared + ceph_assert(m_current_image_ctx->parent != nullptr); + ldout(m_image_ctx->cct, 20) << "under migration, cloning format" << dendl; + m_formats.insert(m_formats.begin() + m_format_idx, + m_formats[m_format_idx - 1]->clone()); + } + m_current_image_ctx = m_current_image_ctx->parent; if (m_current_image_ctx != nullptr) { // move on to loading parent if (m_format_idx >= m_formats.size()) { // try to load next ancestor using the same format - ldout(m_image_ctx->cct, 20) << "cloning format" << dendl; - m_is_current_format_cloned = true; + ldout(m_image_ctx->cct, 20) << "out of formats, cloning format" << dendl; m_formats.push_back(m_formats[m_formats.size() - 1]->clone()); + m_is_current_format_assumed = true; } load(); diff --git a/src/librbd/crypto/LoadRequest.h b/src/librbd/crypto/LoadRequest.h index 84f595bb6c6..702748a2418 100644 --- a/src/librbd/crypto/LoadRequest.h +++ b/src/librbd/crypto/LoadRequest.h @@ -44,7 +44,7 @@ private: Context* m_on_finish; size_t m_format_idx; - bool m_is_current_format_cloned; + bool m_is_current_format_assumed; std::vector<EncryptionFormat> m_formats; I* m_current_image_ctx; std::string m_detected_format_name; diff --git a/src/log/Entry.h b/src/log/Entry.h index 3677c8eb951..db39eca0ef3 100644 --- a/src/log/Entry.h +++ b/src/log/Entry.h @@ -4,9 +4,12 @@ #ifndef __CEPH_LOG_ENTRY_H #define __CEPH_LOG_ENTRY_H +#include "include/compat.h" + #include "log/LogClock.h" #include "common/StackStringStream.h" +#include "common/Thread.h" #include "boost/container/small_vector.hpp" @@ -14,6 +17,7 @@ #include <string_view> + namespace ceph { namespace logging { @@ -27,7 +31,10 @@ 
public: m_thread(pthread_self()), m_prio(pr), m_subsys(sub) - {} + { + strncpy(m_thread_name, Thread::get_thread_name().data(), 16); + m_thread_name[15] = '\0'; + } Entry(const Entry &) = default; Entry& operator=(const Entry &) = default; Entry(Entry &&e) = default; @@ -40,6 +47,7 @@ public: time m_stamp; pthread_t m_thread; short m_prio, m_subsys; + char m_thread_name[16]; static log_clock& clock() { static log_clock clock; diff --git a/src/log/Log.cc b/src/log/Log.cc index 69f6df82ecb..49dd03c06c0 100644 --- a/src/log/Log.cc +++ b/src/log/Log.cc @@ -493,13 +493,13 @@ void Log::dump_recent() _flush(m_flush, false); _log_message("--- begin dump of recent events ---", true); - std::set<pthread_t> recent_pthread_ids; + std::set<std::pair<pthread_t, const char *>> recent_pthread_ids; { EntryVector t; t.insert(t.end(), std::make_move_iterator(m_recent.begin()), std::make_move_iterator(m_recent.end())); m_recent.clear(); for (const auto& e : t) { - recent_pthread_ids.emplace(e.m_thread); + recent_pthread_ids.emplace(std::make_pair(e.m_thread, e.m_thread_name)); } _flush(t, true); } @@ -515,14 +515,11 @@ void Log::dump_recent() m_stderr_log, m_stderr_crash), true); _log_message("--- pthread ID / name mapping for recent threads ---", true); - for (const auto pthread_id : recent_pthread_ids) + for (auto& [pthread_id, pthread_name] : recent_pthread_ids) { - char pthread_name[16] = {0}; //limited by 16B include terminating null byte. - ceph_pthread_getname(pthread_id, pthread_name, sizeof(pthread_name)); // we want the ID to be printed in the same format as we use for a log entry. // The reason is easier grepping. - _log_message(fmt::format(" {:x} / {}", - tid_to_int(pthread_id), pthread_name), true); + _log_message(fmt::format(" {:x} / {}", tid_to_int(pthread_id), pthread_name), true); } _log_message(fmt::format(" max_recent {:9}", m_recent.capacity()), true); diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 059b540feb0..642d3428a27 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -17,6 +17,7 @@ #include "common/likely.h" #include "common/HeartbeatMap.h" +#include "include/compat.h" // for ceph_pthread_setname() #include "include/stringify.h" #include "include/util.h" @@ -73,6 +74,7 @@ void Beacon::init(const MDSMap &mdsmap) _notify_mdsmap(mdsmap); sender = std::thread([this]() { + ceph_pthread_setname(pthread_self(), "beacon"); std::unique_lock<std::mutex> lock(mutex); bool sent; while (!finished) { diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index f000da7928a..76e9fee68f8 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1319,7 +1319,7 @@ void CDir::take_dentry_waiting(std::string_view dname, snapid_t first, snapid_t << it->first.snapid << " on " << *this << dendl; std::copy(it->second.begin(), it->second.end(), std::back_inserter(ls)); - waiting_on_dentry.erase(it++); + it = waiting_on_dentry.erase(it); } if (waiting_on_dentry.empty()) @@ -2823,8 +2823,6 @@ void CDir::_committed(int r, version_t v) auto it = waiting_for_commit.begin(); while (it != waiting_for_commit.end()) { - auto _it = it; - ++_it; if (it->first > committed_version) { dout(10) << " there are waiters for " << it->first << ", committing again" << dendl; _commit(it->first, -1); @@ -2834,8 +2832,7 @@ void CDir::_committed(int r, version_t v) for (const auto &waiter : it->second) t.push_back(waiter); mdcache->mds->queue_waiters(t); - waiting_for_commit.erase(it); - it = _it; + it = waiting_for_commit.erase(it); if (!(++count % mdcache->mds->heartbeat_reset_grace())) mdcache->mds->heartbeat_reset(); diff 
--git a/src/mds/CInode.cc b/src/mds/CInode.cc index 0e9b6996ad2..dfad411d323 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -4589,8 +4589,11 @@ void InodeStoreBase::dump(Formatter *f) const for (const auto& [key, val] : *xattrs) { f->open_object_section("xattr"); f->dump_string("key", key); - std::string v(val.c_str(), val.length()); - f->dump_string("val", v); + if (val.length()) { + f->dump_string("val", std::string(val.c_str(), val.length())); + } else { + f->dump_string("val", ""); + } f->close_section(); } } diff --git a/src/mds/Capability.cc b/src/mds/Capability.cc index 9a3d093f9eb..ea636b7059a 100644 --- a/src/mds/Capability.cc +++ b/src/mds/Capability.cc @@ -73,14 +73,8 @@ void Capability::Export::dump(ceph::Formatter *f) const void Capability::Export::generate_test_instances(std::list<Capability::Export*>& ls) { - ls.push_back(new Export); - ls.push_back(new Export); - ls.back()->wanted = 1; - ls.back()->issued = 2; - ls.back()->pending = 3; - ls.back()->client_follows = 4; - ls.back()->mseq = 5; - ls.back()->last_issue_stamp = utime_t(6, 7); + ls.push_back(new Export()); + ls.push_back(new Export(1, 2, 3, 4, 5, 6, 7, utime_t(8, 9), 10)); } void Capability::Import::encode(ceph::buffer::list &bl) const @@ -108,6 +102,11 @@ void Capability::Import::dump(ceph::Formatter *f) const f->dump_unsigned("migrate_seq", mseq); } +void Capability::Import::generate_test_instances(std::list<Capability::Import*>& ls) +{ + ls.push_back(new Import()); + ls.push_back(new Import(1, 2, 3)); +} /* * Capability::revoke_info */ diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 9680895a5c8..9adcf3b25b9 100644 --- a/src/mds/Capability.h +++ b/src/mds/Capability.h @@ -100,6 +100,7 @@ public: void encode(ceph::buffer::list &bl) const; void decode(ceph::buffer::list::const_iterator &p); void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<Import*>& ls); int64_t cap_id = 0; ceph_seq_t issue_seq = 0; diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc index 88d7fda7c10..7e4df884ca1 100644 --- a/src/mds/FSMap.cc +++ b/src/mds/FSMap.cc @@ -115,6 +115,14 @@ void MirrorInfo::dump(ceph::Formatter *f) const { f->close_section(); // peers } +void MirrorInfo::generate_test_instances(std::list<MirrorInfo*>& ls) { + ls.push_back(new MirrorInfo()); + ls.push_back(new MirrorInfo()); + ls.back()->mirrored = true; + ls.back()->peers.insert(Peer()); + ls.back()->peers.insert(Peer()); +} + void MirrorInfo::print(std::ostream& out) const { out << "[peers=" << peers << "]" << std::endl; } diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h index 518d6273e44..49f1b48d696 100644 --- a/src/mds/FSMap.h +++ b/src/mds/FSMap.h @@ -169,6 +169,7 @@ struct MirrorInfo { Peers peers; void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<MirrorInfo*>& ls); void print(std::ostream& out) const; void encode(ceph::buffer::list &bl) const; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index c433c77b453..eb2b529dcfa 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -960,17 +960,15 @@ void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_p dir->set_dir_auth(auth); // move items nested beneath me, under me. 
- set<CDir*>::iterator p = subtrees[root].begin(); + auto p = subtrees[root].begin(); while (p != subtrees[root].end()) { - set<CDir*>::iterator next = p; - ++next; if (get_subtree_root((*p)->get_parent_dir()) == dir) { // move under me dout(10) << " claiming child bound " << **p << dendl; subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; + p = subtrees[root].erase(p); + } else + ++p; } // i am a bound of the parent subtree. @@ -1113,17 +1111,15 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, m dir->set_dir_auth(auth); // move items nested beneath me, under me. - set<CDir*>::iterator p = subtrees[root].begin(); + auto p = subtrees[root].begin(); while (p != subtrees[root].end()) { - set<CDir*>::iterator next = p; - ++next; if (get_subtree_root((*p)->get_parent_dir()) == dir) { // move under me dout(10) << " claiming child bound " << **p << dendl; subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; + p = subtrees[root].erase(p); + } else + ++p; } // i am a bound of the parent subtree. @@ -1172,8 +1168,8 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, m } // merge stray bounds? while (!subtrees[dir].empty()) { - set<CDir*> copy = subtrees[dir]; - for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) { + const auto copy = subtrees[dir]; + for (auto p = copy.begin(); p != copy.end(); ++p) { if (bounds.count(*p) == 0) { CDir *stray = *p; dout(10) << " swallowing extra subtree at " << *stray << dendl; @@ -1214,7 +1210,7 @@ void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir } dout(10) << " by ino: " << byino << dendl; - for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) { + for (auto p = byino.begin(); p != byino.end(); ++p) { p->second.simplify(); CInode *diri = get_inode(p->first); if (!diri) @@ -1222,7 +1218,7 @@ void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl; fragtree_t tmpdft; - for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) + for (auto q = p->second.begin(); q != p->second.end(); ++q) tmpdft.force_to_leaf(g_ceph_context, *q); for (const auto& fg : p->second) { @@ -1267,7 +1263,7 @@ void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result) ino_fragset[df.ino].insert_raw(df.frag); } // get frags - for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin(); + for (auto p = ino_fragset.begin(); p != ino_fragset.end(); ++p) { p->second.simplify(); @@ -1347,7 +1343,7 @@ void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds) } else { // find them CDir *root = get_subtree_root(dir); - for (set<CDir*>::iterator p = subtrees[root].begin(); + for (auto p = subtrees[root].begin(); p != subtrees[root].end(); ++p) { CDir *t = *p; @@ -1415,7 +1411,7 @@ void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop) CDir *newdir = diri->get_parent_dir(); if (pop) { - map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri); + auto p = projected_subtree_renames.find(diri); ceph_assert(p != projected_subtree_renames.end()); ceph_assert(!p->second.empty()); ceph_assert(p->second.front().first == olddir); @@ -1815,7 +1811,7 @@ void MDCache::project_rstat_inode_to_frag(const MutationRef& mut, if (cur->last != CEPH_NOSNAP) { ceph_assert(cur->dirty_old_rstats.empty()); - set<snapid_t>::const_iterator q = 
snaps.lower_bound(std::max(first, floor)); + auto q = snaps.lower_bound(std::max(first, floor)); if (q == snaps.end() || *q > cur->last) return; } @@ -2487,7 +2483,7 @@ void MDCache::logged_leader_update(metareqid_t reqid) */ void MDCache::finish_committed_leaders() { - for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin(); + for (auto p = uncommitted_leaders.begin(); p != uncommitted_leaders.end(); ++p) { p->second.recovering = false; @@ -2536,16 +2532,16 @@ void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag map<dirfrag_t,vector<dirfrag_t> >& subtrees) { if (subtrees.count(oldparent)) { - vector<dirfrag_t>& v = subtrees[oldparent]; + auto& v = subtrees[oldparent]; dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl; - for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it) + for (auto it = v.begin(); it != v.end(); ++it) if (*it == df) { v.erase(it); break; } } if (subtrees.count(newparent)) { - vector<dirfrag_t>& v = subtrees[newparent]; + auto& v = subtrees[newparent]; dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl; v.push_back(df); } @@ -2766,7 +2762,7 @@ void MDCache::send_peer_resolves() map<mds_rank_t, ref_t<MMDSResolve>> resolves; if (mds->is_resolve()) { - for (map<metareqid_t, upeer>::iterator p = uncommitted_peers.begin(); + for (auto p = uncommitted_peers.begin(); p != uncommitted_peers.end(); ++p) { mds_rank_t leader = p->second.leader; @@ -2777,7 +2773,7 @@ void MDCache::send_peer_resolves() } else { set<mds_rank_t> resolve_set; mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE); - for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + for (auto p = active_requests.begin(); p != active_requests.end(); ++p) { MDRequestRef& mdr = p->second; @@ -2828,7 +2824,7 @@ void MDCache::send_subtree_resolves() } map<mds_rank_t, ref_t<MMDSResolve>> resolves; - for (set<mds_rank_t>::iterator p = recovery_set.begin(); + for (auto p = recovery_set.begin(); p != recovery_set.end(); ++p) { if (*p == mds->get_nodeid()) @@ -2841,7 +2837,7 @@ void MDCache::send_subtree_resolves() map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports; // known - for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) { CDir *dir = p->first; @@ -2858,7 +2854,7 @@ void MDCache::send_subtree_resolves() set<CDir*> bounds; get_subtree_bounds(dir, bounds); vector<dirfrag_t> dfls; - for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) + for (auto q = bounds.begin(); q != bounds.end(); ++q) dfls.push_back((*q)->dirfrag()); my_ambig_imports[dir->dirfrag()] = dfls; @@ -2870,7 +2866,7 @@ void MDCache::send_subtree_resolves() } // bounds too vector<dirfrag_t> dfls; - for (set<CDir*>::iterator q = subtrees[dir].begin(); + for (auto q = subtrees[dir].begin(); q != subtrees[dir].end(); ++q) { CDir *bound = *q; @@ -2883,7 +2879,7 @@ void MDCache::send_subtree_resolves() } // ambiguous - for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin(); + for (auto p = my_ambiguous_imports.begin(); p != my_ambiguous_imports.end(); ++p) { my_ambig_imports[p->first] = p->second; @@ -2896,9 +2892,9 @@ void MDCache::send_subtree_resolves() while (i < p->second.size()) { dirfrag_t b = p->second[i]; if (my_subtrees.count(b)) { - vector<dirfrag_t>& bb = my_subtrees[b]; + auto& bb = my_subtrees[b]; dout(10) << " simplify: " << p->first << " swallowing " << b << " with 
bounds " << bb << dendl; - for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r) + for (auto r = bb.begin(); r != bb.end(); ++r) p->second.push_back(*r); my_subtrees.erase(b); p->second.erase(p->second.begin() + i); @@ -2963,7 +2959,7 @@ void MDCache::handle_mds_failure(mds_rank_t who) // clean up any requests peer to/from this node list<MDRequestRef> finish; - for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + for (auto p = active_requests.begin(); p != active_requests.end(); ++p) { MDRequestRef& mdr = p->second; @@ -3061,7 +3057,7 @@ void MDCache::handle_mds_failure(mds_rank_t who) } } - for (map<metareqid_t, uleader>::iterator p = uncommitted_leaders.begin(); + for (auto p = uncommitted_leaders.begin(); p != uncommitted_leaders.end(); ++p) { // The failed MDS may have already committed the peer update @@ -3080,7 +3076,7 @@ void MDCache::handle_mds_failure(mds_rank_t who) kick_find_ino_peers(who); kick_open_ino_peers(who); - for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin(); + for (auto p = fragments.begin(); p != fragments.end(); ) { dirfrag_t df = p->first; fragment_info_t& info = p->second; @@ -3089,18 +3085,17 @@ void MDCache::handle_mds_failure(mds_rank_t who) if (info.notify_ack_waiting.erase(who) && info.notify_ack_waiting.empty()) { fragment_drop_locks(info); - fragment_maybe_finish(p++); + p = fragment_maybe_finish(p); } else { ++p; } continue; } - ++p; dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl; std::vector<CDir*> dirs; info.dirs.swap(dirs); - fragments.erase(df); + p = fragments.erase(p); fragment_unmark_unfreeze_dirs(dirs); } @@ -3126,7 +3121,7 @@ void MDCache::handle_mds_recovery(mds_rank_t who) MDSContext::vec waiters; // wake up any waiters in their subtrees - for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) { CDir *dir = p->first; @@ -3241,7 +3236,7 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m) map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports; ceph_assert(get_inode(ino)); - for (map<client_t,Capability::Export>::iterator q = cap_exports.begin(); + for (auto q = cap_exports.begin(); q != cap_exports.end(); ++q) { Capability::Import& im = rejoin_imported_caps[from][ino][q->first]; @@ -3283,10 +3278,8 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m) if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) { survivor = true; // check for any import success/failure (from this node) - map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin(); + auto p = my_ambiguous_imports.begin(); while (p != my_ambiguous_imports.end()) { - map<dirfrag_t, vector<dirfrag_t> >::iterator next = p; - ++next; CDir *dir = get_dirfrag(p->first); ceph_assert(dir); dout(10) << "checking ambiguous import " << *dir << dendl; @@ -3305,7 +3298,7 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m) bool inside = true; set<CDir*> bounds; get_force_dirfrag_bound_set(q.second, bounds); - for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) { + for (auto p = bounds.begin(); p != bounds.end(); ++p) { CDir *bound = *p; if (bound->contains(dir)) { inside = false; // nope, bound is dir or parent of dir, not inside. @@ -3316,7 +3309,7 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m) claimed_by_sender = true; } - my_ambiguous_imports.erase(p); // no longer ambiguous. 
+ p = my_ambiguous_imports.erase(p); // no longer ambiguous. if (claimed_by_sender) { dout(7) << "ambiguous import failed on " << *dir << dendl; migrator->import_reverse(dir); @@ -3324,8 +3317,8 @@ void MDCache::handle_resolve(const cref_t<MMDSResolve> &m) dout(7) << "ambiguous import succeeded on " << *dir << dendl; migrator->import_finish(dir, true); } - } - p = next; + } else + ++p; } } @@ -3507,9 +3500,9 @@ void MDCache::add_uncommitted_peer(metareqid_t reqid, LogSegment *ls, mds_rank_t if (su == nullptr) { return; } - for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) + for(auto p = su->olddirs.begin(); p != su->olddirs.end(); ++p) uncommitted_peer_rename_olddir[*p]++; - for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) + for(auto p = su->unlinked.begin(); p != su->unlinked.end(); ++p) uncommitted_peer_unlink[*p]++; } @@ -3533,9 +3526,9 @@ void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist) return; } // discard the non-auth subtree we renamed out of - for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) { + for(auto p = su->olddirs.begin(); p != su->olddirs.end(); ++p) { CInode *diri = *p; - map<CInode*, int>::iterator it = uncommitted_peer_rename_olddir.find(diri); + auto it = uncommitted_peer_rename_olddir.find(diri); ceph_assert(it != uncommitted_peer_rename_olddir.end()); it->second--; if (it->second == 0) { @@ -3553,9 +3546,9 @@ void MDCache::finish_uncommitted_peer(metareqid_t reqid, bool assert_exist) ceph_assert(it->second > 0); } // removed the inodes that were unlinked by peer update - for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) { + for(auto p = su->unlinked.begin(); p != su->unlinked.end(); ++p) { CInode *in = *p; - map<CInode*, int>::iterator it = uncommitted_peer_unlink.find(in); + auto it = uncommitted_peer_unlink.find(in); ceph_assert(it != uncommitted_peer_unlink.end()); it->second--; if (it->second == 0) { @@ -3598,13 +3591,13 @@ void MDCache::disambiguate_other_imports() bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); // other nodes' ambiguous imports - for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin(); + for (auto p = other_ambiguous_imports.begin(); p != other_ambiguous_imports.end(); ++p) { mds_rank_t who = p->first; dout(10) << "ambiguous imports for mds." << who << dendl; - for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin(); + for (auto q = p->second.begin(); q != p->second.end(); ++q) { dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl; @@ -3639,7 +3632,7 @@ void MDCache::disambiguate_my_imports() // my ambiguous imports mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid()); while (!my_ambiguous_imports.empty()) { - map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin(); + auto q = my_ambiguous_imports.begin(); CDir *dir = get_dirfrag(q->first); ceph_assert(dir); @@ -3667,7 +3660,7 @@ void MDCache::disambiguate_my_imports() mds->mdlog->flush(); // verify all my subtrees are unambiguous! 
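Several of the MDCache loops above (and more below) drop the save-next-then-erase pattern in favor of advancing via the iterator returned by erase(): since C++11, erase() on the standard associative containers returns the iterator following the removed element, so no temporary "next" iterator is needed. A minimal standalone sketch of the idiom, with illustrative values only:

#include <iostream>
#include <set>

int main() {
  std::set<int> s{1, 2, 3, 4, 5, 6};
  // Erase even elements while iterating; erase() hands back the next
  // valid iterator, so the loop never touches an invalidated one.
  for (auto it = s.begin(); it != s.end(); ) {
    if (*it % 2 == 0)
      it = s.erase(it);
    else
      ++it;
  }
  for (int v : s)
    std::cout << v << ' ';   // prints: 1 3 5
  std::cout << '\n';
  return 0;
}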
- for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) { CDir *dir = p->first; @@ -3692,7 +3685,7 @@ void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds) { // make a list vector<dirfrag_t> binos; - for (set<CDir*>::iterator p = bounds.begin(); + for (auto p = bounds.begin(); p != bounds.end(); ++p) binos.push_back((*p)->dirfrag()); @@ -3849,14 +3842,14 @@ void MDCache::recalc_auth_bits(bool replay) } set<CInode*> subtree_inodes; - for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) { if (p->first->dir_auth.first == mds->get_nodeid()) subtree_inodes.insert(p->first->inode); } - for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) { if (p->first->inode->is_mdsdir()) { @@ -4079,7 +4072,7 @@ void MDCache::rejoin_send_rejoins() ++q; } else { // remove reconnect with no session - p.second.second.erase(q++); + q = p.second.second.erase(q); } } rejoins[target]->cap_exports[p.first] = p.second.second; @@ -4096,7 +4089,7 @@ void MDCache::rejoin_send_rejoins() // check all subtrees - for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) { CDir *dir = p->first; @@ -4166,7 +4159,7 @@ void MDCache::rejoin_send_rejoins() if (!mds->is_rejoin()) { // i am survivor. send strong rejoin. // note request remote_auth_pins, xlocks - for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + for (auto p = active_requests.begin(); p != active_requests.end(); ++p) { MDRequestRef& mdr = p->second; @@ -4582,7 +4575,7 @@ void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak) } // weak base inodes? (root, stray, etc.) 
- for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin(); + for (auto p = weak->weak_inodes.begin(); p != weak->weak_inodes.end(); ++p) { CInode *in = get_inode(*p); @@ -4616,7 +4609,7 @@ void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak) rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks); mds->send_message(ack, weak->get_connection()); - for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) { + for (auto p = gather_locks.begin(); p != gather_locks.end(); ++p) { if (!(*p)->is_stable()) mds->locker->eval_gather(*p); } @@ -5184,12 +5177,12 @@ void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack) auto bp = ack->imported_caps.cbegin(); decode(peer_imported, bp); - for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin(); + for (auto p = peer_imported.begin(); p != peer_imported.end(); ++p) { auto& ex = cap_exports.at(p->first); ceph_assert(ex.first == from); - for (map<client_t,Capability::Import>::iterator q = p->second.begin(); + for (auto q = p->second.begin(); q != p->second.end(); ++q) { auto r = ex.second.find(q->first); @@ -5271,7 +5264,7 @@ void MDCache::rejoin_trim_undef_inodes() dout(10) << "rejoin_trim_undef_inodes" << dendl; while (!rejoin_undef_inodes.empty()) { - set<CInode*>::iterator p = rejoin_undef_inodes.begin(); + auto p = rejoin_undef_inodes.begin(); CInode *in = *p; rejoin_undef_inodes.erase(p); @@ -5496,12 +5489,12 @@ bool MDCache::process_imported_caps() } // process caps that were exported by peer rename - for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_peer_exports.begin(); + for (auto p = rejoin_peer_exports.begin(); p != rejoin_peer_exports.end(); ++p) { CInode *in = get_inode(p->first); ceph_assert(in); - for (map<client_t,Capability::Export>::iterator q = p->second.second.begin(); + for (auto q = p->second.second.begin(); q != p->second.second.end(); ++q) { auto r = rejoin_session_map.find(q->first); @@ -5568,7 +5561,7 @@ bool MDCache::process_imported_caps() } } } - cap_imports.erase(p++); // remove and move on + p = cap_imports.erase(p); // remove and move on } } else { trim_non_auth(); @@ -5690,7 +5683,7 @@ void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, for (auto p = realm->inodes_with_caps.begin(); !p.end(); ++p) split_inos.push_back((*p)->ino()); - for (set<SnapRealm*>::iterator p = realm->open_children.begin(); + for (auto p = realm->open_children.begin(); p != realm->open_children.end(); ++p) split_realms.push_back((*p)->inode->ino()); @@ -5737,12 +5730,12 @@ void MDCache::clean_open_file_lists() { dout(10) << "clean_open_file_lists" << dendl; - for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin(); + for (auto p = mds->mdlog->segments.begin(); p != mds->mdlog->segments.end(); ++p) { LogSegment *ls = p->second; - elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file)); + auto q = ls->open_files.begin(member_offset(CInode, item_open_file)); while (!q.end()) { CInode *in = *q; ++q; @@ -5828,7 +5821,7 @@ void MDCache::export_remaining_imported_caps() mds->heartbeat_reset(); } - for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin(); + for (auto p = cap_reconnect_waiters.begin(); p != cap_reconnect_waiters.end(); ++p) mds->queue_waiters(p->second); @@ -5869,7 +5862,7 @@ Capability* MDCache::try_reconnect_cap(CInode *in, Session *session) dout(15) << " chose lock states on " 
<< *in << dendl; } - map<inodeno_t, MDSContext::vec >::iterator it = + auto it = cap_reconnect_waiters.find(in->ino()); if (it != cap_reconnect_waiters.end()) { mds->queue_waiters(it->second); @@ -5956,7 +5949,7 @@ void MDCache::open_snaprealms() } } - rejoin_pending_snaprealms.erase(it++); + it = rejoin_pending_snaprealms.erase(it); in->put(CInode::PIN_OPENINGSNAPPARENTS); send_snaps(splits); @@ -6094,10 +6087,10 @@ void MDCache::rejoin_send_acks() dout(7) << "rejoin_send_acks" << dendl; // replicate stray - for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin(); + for (auto p = rejoin_unlinked_inodes.begin(); p != rejoin_unlinked_inodes.end(); ++p) { - for (set<CInode*>::iterator q = p->second.begin(); + for (auto q = p->second.begin(); q != p->second.end(); ++q) { CInode *in = *q; @@ -6127,7 +6120,7 @@ void MDCache::rejoin_send_acks() // send acks to everyone in the recovery set map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks; - for (set<mds_rank_t>::iterator p = recovery_set.begin(); + for (auto p = recovery_set.begin(); p != recovery_set.end(); ++p) { if (rejoin_ack_sent.count(*p)) @@ -6138,7 +6131,7 @@ void MDCache::rejoin_send_acks() rejoin_ack_sent = recovery_set; // walk subtrees - for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) { CDir *dir = p->first; @@ -6236,7 +6229,7 @@ void MDCache::rejoin_send_acks() } // include inode base for any inodes whose scatterlocks may have updated - for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin(); + for (auto p = rejoin_potential_updated_scatterlocks.begin(); p != rejoin_potential_updated_scatterlocks.end(); ++p) { CInode *in = *p; @@ -6663,7 +6656,7 @@ void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls) { dout(10) << "truncate_inode_finish " << *in << dendl; - set<CInode*>::iterator p = ls->truncating_inodes.find(in); + auto p = ls->truncating_inodes.find(in); ceph_assert(p != ls->truncating_inodes.end()); ls->truncating_inodes.erase(p); @@ -6719,7 +6712,7 @@ void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls) dout(20) << "remove_recovered_truncate " << *in << " in log segment " << ls->seq << "/" << ls->offset << dendl; // if we have the logseg the truncate started in, it must be in our list. 
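Most of the remaining churn in these MDCache hunks replaces fully spelled-out iterator types with auto; the deduced type is identical, only the declaration gets shorter. An illustrative before/after on a hypothetical nested map (not the actual MDCache containers):

#include <map>
#include <vector>

int main() {
  std::map<int, std::vector<int>> subtrees{{1, {2, 3}}, {4, {5}}};

  // Fully spelled-out iterator type (pre-change style):
  for (std::map<int, std::vector<int>>::iterator p = subtrees.begin();
       p != subtrees.end(); ++p)
    p->second.push_back(0);

  // Equivalent with auto (post-change style); behavior is unchanged.
  for (auto p = subtrees.begin(); p != subtrees.end(); ++p)
    p->second.push_back(0);

  return 0;
}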
- set<CInode*>::iterator p = ls->truncating_inodes.find(in); + auto p = ls->truncating_inodes.find(in); ceph_assert(p != ls->truncating_inodes.end()); ls->truncating_inodes.erase(p); in->put(CInode::PIN_TRUNCATING); @@ -6728,11 +6721,11 @@ void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls) void MDCache::start_recovered_truncates() { dout(10) << "start_recovered_truncates" << dendl; - for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin(); + for (auto p = mds->mdlog->segments.begin(); p != mds->mdlog->segments.end(); ++p) { LogSegment *ls = p->second; - for (set<CInode*>::iterator q = ls->truncating_inodes.begin(); + for (auto q = ls->truncating_inodes.begin(); q != ls->truncating_inodes.end(); ++q) { CInode *in = *q; @@ -7006,7 +6999,7 @@ std::pair<bool, uint64_t> MDCache::trim(uint64_t count) // Other rank's base inodes (when I'm stopping) if (mds->is_stopping()) { - for (set<CInode*>::iterator p = base_inodes.begin(); + for (auto p = base_inodes.begin(); p != base_inodes.end();) { CInode *base_in = *p; ++p; @@ -7278,7 +7271,7 @@ void MDCache::trim_non_auth() dout(7) << "trim_non_auth" << dendl; // temporarily pin all subtree roots - for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) p->first->get(CDir::PIN_SUBTREETEMP); @@ -7349,7 +7342,7 @@ void MDCache::trim_non_auth() lru.lru_touch_entire_pintail(); // unpin all subtrees - for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) p->first->put(CDir::PIN_SUBTREETEMP); @@ -7461,7 +7454,7 @@ void MDCache::try_trim_non_auth_subtree(CDir *dir) // can we now trim child subtrees? set<CDir*> bounds; get_subtree_bounds(dir, bounds); - for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) { + for (auto p = bounds.begin(); p != bounds.end(); ++p) { CDir *bd = *p; if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth bd->get_num_any() == 0 && // and empty @@ -7746,7 +7739,7 @@ void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m) } } - for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) { + for (auto p = gather_locks.begin(); p != gather_locks.end(); ++p) { if (!(*p)->is_stable()) mds->locker->eval_gather(*p); } @@ -9292,7 +9285,7 @@ void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) info.auth_hint = MDS_RANK_NONE; } } else { - for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p) + for (auto p = active.begin(); p != active.end(); ++p) if (*p != whoami && info.checked.count(*p) == 0) { peer = *p; break; @@ -9405,7 +9398,7 @@ void MDCache::kick_open_ino_peers(mds_rank_t who) { dout(10) << "kick_open_ino_peers mds." 
<< who << dendl; - for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin(); + for (auto p = opening_inodes.begin(); p != opening_inodes.end(); ++p) { open_ino_info_t& info = p->second; @@ -9546,7 +9539,7 @@ void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip) m = fip.hint; fip.hint = MDS_RANK_NONE; } else { - for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p) + for (auto p = active.begin(); p != active.end(); ++p) if (*p != mds->get_nodeid() && fip.checked.count(*p) == 0) { m = *p; @@ -9645,7 +9638,7 @@ void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m) void MDCache::kick_find_ino_peers(mds_rank_t who) { // find_ino_peers requests we should move on from - for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin(); + for (auto p = find_ino_peer.begin(); p != find_ino_peer.end(); ++p) { find_ino_peer_info_t& fip = p->second; @@ -9665,7 +9658,7 @@ void MDCache::kick_find_ino_peers(mds_rank_t who) int MDCache::get_num_client_requests() { int count = 0; - for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin(); + for (auto p = active_requests.begin(); p != active_requests.end(); ++p) { MDRequestRef& mdr = p->second; @@ -9766,7 +9759,7 @@ MDRequestRef MDCache::request_start_internal(int op) MDRequestRef MDCache::request_get(metareqid_t rid) { - ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid); + auto p = active_requests.find(rid); ceph_assert(p != active_requests.end()); dout(7) << "request_get " << rid << " " << *p->second << dendl; return p->second; @@ -10435,7 +10428,7 @@ void MDCache::discover_path(CDir *base, void MDCache::kick_discovers(mds_rank_t who) { - for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin(); + for (auto p = discovers.begin(); p != discovers.end(); ++p) { if (p->second.mds != who) @@ -10772,7 +10765,7 @@ void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m) // decrement discover counters if (m->get_tid()) { - map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid()); + auto p = discovers.find(m->get_tid()); if (p != discovers.end()) { dout(10) << " found tid " << m->get_tid() << dendl; discovers.erase(p); @@ -11178,7 +11171,7 @@ int MDCache::send_dir_updates(CDir *dir, bool bcast) } mds_rank_t whoami = mds->get_nodeid(); - for (set<mds_rank_t>::iterator it = who.begin(); + for (auto it = who.begin(); it != who.end(); ++it) { if (*it == whoami) continue; @@ -11351,7 +11344,7 @@ void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, const MDRequestR CInode *strayin = straydn->get_linkage()->get_inode(); strayin->encode_snap_blob(snapbl); } - for (set<mds_rank_t>::iterator it = replicas.begin(); + for (auto it = replicas.begin(); it != replicas.end(); ++it) { // don't tell (rmdir) witnesses; they already know @@ -11588,7 +11581,7 @@ void MDCache::adjust_dir_fragments(CInode *diri, set<CDir*> bounds; bounds.swap(subtrees[dir]); subtrees.erase(dir); - for (set<CDir*>::iterator p = bounds.begin(); + for (auto p = bounds.begin(); p != bounds.end(); ++p) { CDir *frag = get_subtree_root((*p)->get_parent_dir()); @@ -11627,11 +11620,11 @@ void MDCache::adjust_dir_fragments(CInode *diri, for (const auto& dir : srcfrags) { ceph_assert(dir->is_subtree_root()); dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl; - map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir); - set<CDir*>::iterator r = q->second.begin(); + auto q = subtrees.find(dir); + auto r = 
q->second.begin(); while (r != subtrees[dir].end()) { new_bounds.insert(*r); - subtrees[dir].erase(r++); + r = subtrees[dir].erase(r); } subtrees.erase(q); @@ -11835,7 +11828,7 @@ public: void MDCache::fragment_mark_and_complete(const MDRequestRef& mdr) { dirfrag_t basedirfrag = mdr->more()->fragment_base; - map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag); + auto it = fragments.find(basedirfrag); if (it == fragments.end() || it->second.mdr != mdr) { dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl; request_finish(mdr); @@ -11938,8 +11931,7 @@ void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs) bool MDCache::fragment_are_all_frozen(CDir *dir) { ceph_assert(dir->is_frozen_dir()); - map<dirfrag_t,fragment_info_t>::iterator p; - for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); + for (auto p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); p != fragments.end() && p->first.ino == dir->ino(); ++p) { if (p->first.frag.contains(dir->get_frag())) @@ -11951,8 +11943,7 @@ bool MDCache::fragment_are_all_frozen(CDir *dir) void MDCache::fragment_freeze_inc_num_waiters(CDir *dir) { - map<dirfrag_t,fragment_info_t>::iterator p; - for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); + for (auto p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); p != fragments.end() && p->first.ino == dir->ino(); ++p) { if (p->first.frag.contains(dir->get_frag())) { @@ -11971,7 +11962,7 @@ void MDCache::find_stale_fragment_freeze() utime_t cutoff = now; cutoff -= g_conf()->mds_freeze_tree_timeout; - for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin(); + for (auto p = fragments.begin(); p != fragments.end(); ) { dirfrag_t df = p->first; fragment_info_t& info = p->second; @@ -12060,7 +12051,7 @@ public: void MDCache::fragment_frozen(const MDRequestRef& mdr, int r) { dirfrag_t basedirfrag = mdr->more()->fragment_base; - map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag); + auto it = fragments.find(basedirfrag); if (it == fragments.end() || it->second.mdr != mdr || r < 0) { dout(7) << "fragment_frozen " << basedirfrag << " must have aborted; rc=" << r << dendl; request_finish(mdr); @@ -12079,7 +12070,7 @@ void MDCache::fragment_frozen(const MDRequestRef& mdr, int r) void MDCache::dispatch_fragment_dir(const MDRequestRef& mdr, bool abort_if_freezing) { dirfrag_t basedirfrag = mdr->more()->fragment_base; - map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag); + auto it = fragments.find(basedirfrag); if (it == fragments.end() || it->second.mdr != mdr) { dout(7) << __func__ << ": " << basedirfrag << " must have aborted" << dendl; request_finish(mdr); @@ -12402,12 +12393,12 @@ void MDCache::fragment_drop_locks(fragment_info_t& info) //info.mdr.reset(); } -void MDCache::fragment_maybe_finish(const fragment_info_iterator& it) +MDCache::fragment_info_iterator MDCache::fragment_maybe_finish(const fragment_info_iterator it) { ceph_assert(kill_dirfrag_at != dirfrag_killpoint::FRAGMENT_MAYBE_FINISH); if (!it->second.finishing) - return; + return it; // unmark & auth_unpin for (const auto &dir : it->second.resultfrags) { @@ -12421,7 +12412,7 @@ void MDCache::fragment_maybe_finish(const fragment_info_iterator& it) mds->balancer->maybe_fragment(dir, false); } - fragments.erase(it); + return fragments.erase(it); } @@ -12522,7 +12513,7 @@ void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op) { dout(10) << "finish_uncommitted_fragments: base dirfrag " << 
basedirfrag << " op " << EFragment::op_name(op) << dendl; - map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + auto it = uncommitted_fragments.find(basedirfrag); if (it != uncommitted_fragments.end()) { ufragment& uf = it->second; if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) { @@ -12539,7 +12530,7 @@ void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& { dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag << " old_frags (" << old_frags << ")" << dendl; - map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag); + auto it = uncommitted_fragments.find(basedirfrag); if (it != uncommitted_fragments.end()) { ufragment& uf = it->second; if (!uf.old_frags.empty()) { @@ -12575,7 +12566,7 @@ struct C_MDC_FragmentRollback : public MDCacheLogContext { void MDCache::rollback_uncommitted_fragments() { dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl; - for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin(); + for (auto p = uncommitted_fragments.begin(); p != uncommitted_fragments.end(); ++p) { ufragment &uf = p->second; @@ -12722,7 +12713,7 @@ void MDCache::show_subtrees(int dbl, bool force_print) // root frags std::vector<CDir*> basefrags; - for (set<CInode*>::iterator p = base_inodes.begin(); + for (auto p = base_inodes.begin(); p != base_inodes.end(); ++p) (*p)->get_dirfrags(basefrags); @@ -12760,13 +12751,11 @@ void MDCache::show_subtrees(int dbl, bool force_print) seen.insert(dir); // nested items? - if (!subtrees[dir].empty()) { - for (set<CDir*>::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - //dout(25) << " saw sub " << **p << dendl; - q.push_front(pair<CDir*,int>(*p, d+1)); - } + for (auto p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) { + //dout(25) << " saw sub " << **p << dendl; + q.push_front(pair<CDir*,int>(*p, d+1)); } } @@ -12831,7 +12820,7 @@ void MDCache::show_subtrees(int dbl, bool force_print) else indent += " "; - for (set<CDir*>::iterator p = subtrees[dir].begin(); + for (auto p = subtrees[dir].begin(); p != subtrees[dir].end(); ++p) q.push_front(pair<CDir*,int>(*p, d+2)); @@ -12840,7 +12829,7 @@ void MDCache::show_subtrees(int dbl, bool force_print) // verify there isn't stray crap in subtree map int lost = 0; - for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); + for (auto p = subtrees.begin(); p != subtrees.end(); ++p) { if (subtrees_seen.count(p->first)) continue; diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 0b01c9ab859..3c5d7e5e4f4 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -1485,7 +1485,7 @@ private: void fragment_frozen(const MDRequestRef& mdr, int r); void fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs); void fragment_drop_locks(fragment_info_t &info); - void fragment_maybe_finish(const fragment_info_iterator& it); + fragment_info_iterator fragment_maybe_finish(const fragment_info_iterator it); void dispatch_fragment_dir(const MDRequestRef& mdr, bool abort_if_freezing=false); void _fragment_logged(const MDRequestRef& mdr); void _fragment_stored(const MDRequestRef& mdr); diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 8e267503ab2..c2f3544f97b 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -3107,7 +3107,7 @@ void MDSRankDispatcher::evict_clients( dout(20) << __func__ << " matched " << victims.size() << " sessions" << dendl; if (victims.empty()) { - on_finish(-ESRCH, "no 
hosts match", outbl); + on_finish(0, "no hosts match", outbl); return; } diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc index 594e8db87f8..925bff16542 100644 --- a/src/mds/PurgeQueue.cc +++ b/src/mds/PurgeQueue.cc @@ -99,6 +99,17 @@ void PurgeItem::decode(bufferlist::const_iterator &p) DECODE_FINISH(p); } +void PurgeItem::generate_test_instances(std::list<PurgeItem*>& ls) { + ls.push_back(new PurgeItem()); + ls.push_back(new PurgeItem()); + ls.back()->action = PurgeItem::PURGE_FILE; + ls.back()->ino = 1; + ls.back()->size = 2; + ls.back()->layout = file_layout_t(); + ls.back()->old_pools = {1, 2}; + ls.back()->snapc = SnapContext(); + ls.back()->stamp = utime_t(3, 4); +} // if Objecter has any slow requests, take that as a hint and // slow down our rate of purging PurgeQueue::PurgeQueue( diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h index 7bc101e31c4..bbf260ae70d 100644 --- a/src/mds/PurgeQueue.h +++ b/src/mds/PurgeQueue.h @@ -61,6 +61,7 @@ public: fragtree.dump(f); f->close_section(); } + static void generate_test_instances(std::list<PurgeItem*>& ls); std::string_view get_type_str() const; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 65ac6f17b43..cf286b46d46 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4464,7 +4464,6 @@ void Server::_lookup_ino_2(const MDRequestRef& mdr, int r) } -/* This function takes responsibility for the passed mdr*/ void Server::handle_client_open(const MDRequestRef& mdr) { const cref_t<MClientRequest> &req = mdr->client_request; @@ -4702,7 +4701,6 @@ bool Server::is_valid_layout(file_layout_t *layout) return true; } -/* This function takes responsibility for the passed mdr*/ void Server::handle_client_openc(const MDRequestRef& mdr) { const cref_t<MClientRequest> &req = mdr->client_request; @@ -7169,7 +7167,6 @@ void Server::handle_client_mknod(const MDRequestRef& mdr) // MKDIR -/* This function takes responsibility for the passed mdr*/ void Server::handle_client_mkdir(const MDRequestRef& mdr) { const cref_t<MClientRequest> &req = mdr->client_request; @@ -8767,8 +8764,6 @@ public: * all other nodes have also replciated destdn and straydn. note that * destdn replicas need not also replicate srci. this only works when * destdn is leader. - * - * This function takes responsibility for the passed mdr. 
*/ void Server::handle_client_rename(const MDRequestRef& mdr) { @@ -10913,7 +10908,6 @@ void Server::_peer_rename_sessions_flushed(const MDRequestRef& mdr) } // snaps -/* This function takes responsibility for the passed mdr*/ void Server::handle_client_lssnap(const MDRequestRef& mdr) { const cref_t<MClientRequest> &req = mdr->client_request; @@ -11023,7 +11017,6 @@ struct C_MDS_mksnap_finish : public ServerLogContext { } }; -/* This function takes responsibility for the passed mdr*/ void Server::handle_client_mksnap(const MDRequestRef& mdr) { const cref_t<MClientRequest> &req = mdr->client_request; @@ -11220,7 +11213,6 @@ struct C_MDS_rmsnap_finish : public ServerLogContext { } }; -/* This function takes responsibility for the passed mdr*/ void Server::handle_client_rmsnap(const MDRequestRef& mdr) { const cref_t<MClientRequest> &req = mdr->client_request; @@ -11350,7 +11342,6 @@ struct C_MDS_renamesnap_finish : public ServerLogContext { } }; -/* This function takes responsibility for the passed mdr*/ void Server::handle_client_renamesnap(const MDRequestRef& mdr) { const cref_t<MClientRequest> &req = mdr->client_request; diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index 623f20a0eb7..9e82f00a9bf 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -574,7 +574,6 @@ public: } static void generate_test_instances(std::list<SessionMapStore*>& ls); - void reset_state() { session_map.clear(); diff --git a/src/mds/SimpleLock.cc b/src/mds/SimpleLock.cc index da266e30dab..df61384a3ca 100644 --- a/src/mds/SimpleLock.cc +++ b/src/mds/SimpleLock.cc @@ -43,6 +43,13 @@ void SimpleLock::dump(ceph::Formatter *f) const { f->close_section(); } +void SimpleLock::generate_test_instances(std::list<SimpleLock*>& ls) { + ls.push_back(new SimpleLock); + ls.push_back(new SimpleLock); + ls.back()->set_state(LOCK_SYNC); +} + + int SimpleLock::get_wait_shift() const { switch (get_type()) { case CEPH_LOCK_DN: return 0; diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 6f1d049ea0a..55621549a8f 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -175,6 +175,12 @@ public: } } + //for dencoder only + SimpleLock() : + type(nullptr), + parent(nullptr) + {} + SimpleLock(MDSCacheObject *o, const LockType *lt) : type(lt), parent(o) @@ -199,8 +205,8 @@ public: // parent MDSCacheObject *get_parent() { return parent; } - int get_type() const { return type->type; } - const sm_t* get_sm() const { return type->sm; } + int get_type() const { return (type != nullptr) ? type->type : 0; } + const sm_t* get_sm() const { return (type != nullptr) ? type->sm : nullptr; } int get_cap_shift() const; int get_cap_mask() const; @@ -493,6 +499,7 @@ public: encode(empty_gather_set, bl); ENCODE_FINISH(bl); } + void decode(ceph::buffer::list::const_iterator& p) { DECODE_START(2, p); decode(state, p); @@ -588,6 +595,7 @@ public: * to formatter, or nothing if is_sync_and_unlocked. 
*/ void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<SimpleLock*>& ls); virtual void print(std::ostream& out) const { out << "("; diff --git a/src/mds/flock.cc b/src/mds/flock.cc index 69d579d3034..c126b0f0898 100644 --- a/src/mds/flock.cc +++ b/src/mds/flock.cc @@ -37,6 +37,50 @@ ceph_lock_state_t::~ceph_lock_state_t() } } +void ceph_lock_state_t::dump(ceph::Formatter *f) const { + f->dump_int("type", type); + f->dump_int("held_locks", held_locks.size()); + for (auto &p : held_locks) { + f->open_object_section("lock"); + f->dump_int("start", p.second.start); + f->dump_int("length", p.second.length); + f->dump_int("client", p.second.client); + f->dump_int("owner", p.second.owner); + f->dump_int("pid", p.second.pid); + f->dump_int("type", p.second.type); + f->close_section(); + } + f->dump_int("waiting_locks", waiting_locks.size()); + for (auto &p : waiting_locks) { + f->open_object_section("lock"); + f->dump_int("start", p.second.start); + f->dump_int("length", p.second.length); + f->dump_int("client", p.second.client); + f->dump_int("owner", p.second.owner); + f->dump_int("pid", p.second.pid); + f->dump_int("type", p.second.type); + f->close_section(); + } + f->dump_int("client_held_lock_counts", client_held_lock_counts.size()); + for (auto &p : client_held_lock_counts) { + f->open_object_section("client"); + f->dump_int("client_id", p.first.v); + f->dump_int("count", p.second); + f->close_section(); + } + f->dump_int("client_waiting_lock_counts", client_waiting_lock_counts.size()); +} + + +void ceph_lock_state_t::generate_test_instances(std::list<ceph_lock_state_t*>& ls) { + ls.push_back(new ceph_lock_state_t(NULL, 0)); + ls.push_back(new ceph_lock_state_t(NULL, 1)); + ls.back()->held_locks.insert(std::make_pair(1, ceph_filelock())); + ls.back()->waiting_locks.insert(std::make_pair(1, ceph_filelock())); + ls.back()->client_held_lock_counts.insert(std::make_pair(1, 1)); + ls.back()->client_waiting_lock_counts.insert(std::make_pair(1, 1)); +} + bool ceph_lock_state_t::is_waiting(const ceph_filelock &fl) const { auto p = waiting_locks.find(fl.start); diff --git a/src/mds/flock.h b/src/mds/flock.h index 915d912e1ee..6871f2decc5 100644 --- a/src/mds/flock.h +++ b/src/mds/flock.h @@ -71,6 +71,7 @@ inline bool operator!=(const ceph_filelock& l, const ceph_filelock& r) { class ceph_lock_state_t { public: explicit ceph_lock_state_t(CephContext *cct_, int type_) : cct(cct_), type(type_) {} + ceph_lock_state_t() : cct(NULL), type(0) {} ~ceph_lock_state_t(); /** * Check if a lock is on the waiting_locks list. 
@@ -132,6 +133,8 @@ public: decode(held_locks, bl); decode(client_held_lock_counts, bl); } + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<ceph_lock_state_t*>& ls); bool empty() const { return held_locks.empty() && waiting_locks.empty() && client_held_lock_counts.empty() && diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index fce09baef81..680218e62e3 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -528,6 +528,15 @@ void feature_bitset_t::dump(Formatter *f) const { f->dump_string("feature_bits", css->strv()); } +void feature_bitset_t::generate_test_instances(std::list<feature_bitset_t*>& ls) +{ + ls.push_back(new feature_bitset_t()); + ls.push_back(new feature_bitset_t()); + ls.back()->_vec.push_back(1); + ls.back()->_vec.push_back(2); + ls.back()->_vec.push_back(3); +} + void feature_bitset_t::print(ostream& out) const { std::ios_base::fmtflags f(out.flags()); @@ -564,6 +573,13 @@ void metric_spec_t::dump(Formatter *f) const { f->dump_object("metric_flags", metric_flags); } +void metric_spec_t::generate_test_instances(std::list<metric_spec_t*>& ls) +{ + ls.push_back(new metric_spec_t()); + ls.push_back(new metric_spec_t()); + ls.back()->metric_flags = 1; +} + void metric_spec_t::print(ostream& out) const { out << "{metric_flags: '" << metric_flags << "'}"; @@ -601,6 +617,16 @@ void client_metadata_t::dump(Formatter *f) const f->dump_string(name.c_str(), val); } +void client_metadata_t::generate_test_instances(std::list<client_metadata_t*>& ls) +{ + ls.push_back(new client_metadata_t()); + ls.push_back(new client_metadata_t()); + ls.back()->kv_map["key1"] = "val1"; + ls.back()->kv_map["key2"] = "val2"; + ls.back()->features = 0x12345678; + ls.back()->metric_spec.metric_flags = 0x12345678; +} + /* * session_info_t */ diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 17a5bf7acae..3b8269006cb 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -349,6 +349,7 @@ public: void decode(ceph::buffer::list::const_iterator &p); void dump(ceph::Formatter *f) const; void print(std::ostream& out) const; + static void generate_test_instances(std::list<feature_bitset_t*>& ls); private: void init_array(const std::vector<size_t>& v); @@ -387,6 +388,7 @@ struct metric_spec_t { void encode(ceph::buffer::list& bl) const; void decode(ceph::buffer::list::const_iterator& p); void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<metric_spec_t*>& ls); void print(std::ostream& out) const; // set of metrics that a client is capable of forwarding @@ -433,6 +435,7 @@ struct client_metadata_t { void encode(ceph::buffer::list& bl) const; void decode(ceph::buffer::list::const_iterator& p); void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<client_metadata_t*>& ls); kv_map_t kv_map; feature_bitset_t features; @@ -634,7 +637,10 @@ struct metareqid_t { void print(std::ostream& out) const { out << name << ":" << tid; } - + static void generate_test_instances(std::list<metareqid_t*>& ls) { + ls.push_back(new metareqid_t); + ls.push_back(new metareqid_t(entity_name_t::CLIENT(123), 456)); + } entity_name_t name; uint64_t tid = 0; }; @@ -786,6 +792,15 @@ struct dirfrag_t { decode(ino, bl); decode(frag, bl); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("ino", ino); + f->dump_unsigned("frag", frag); + } + static void generate_test_instances(std::list<dirfrag_t*>& ls) { + ls.push_back(new dirfrag_t); + ls.push_back(new dirfrag_t(1, frag_t())); + ls.push_back(new dirfrag_t(2, 
frag_t(3))); + } inodeno_t ino = 0; frag_t frag; diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h index b87c3153af3..c157c33e758 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -155,6 +155,25 @@ struct MDSHealthMetric DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->dump_string("type", mds_metric_name(type)); + f->dump_stream("sev") << sev; + f->dump_string("message", message); + f->open_object_section("metadata"); + for (auto& i : metadata) { + f->dump_string(i.first.c_str(), i.second); + } + f->close_section(); + } + + static void generate_test_instances(std::list<MDSHealthMetric*>& ls) { + ls.push_back(new MDSHealthMetric()); + ls.back()->type = MDS_HEALTH_CACHE_OVERSIZED; + ls.push_back(new MDSHealthMetric(MDS_HEALTH_TRIM, HEALTH_WARN, "MDS is behind on trimming")); + ls.back()->metadata["mds"] = "a"; + ls.back()->metadata["num"] = "1"; + } + bool operator==(MDSHealthMetric const &other) const { return (type == other.type && sev == other.sev && message == other.message); @@ -187,6 +206,23 @@ struct MDSHealth DECODE_FINISH(bl); } + void dump(ceph::Formatter *f) const { + f->open_array_section("metrics"); + for (auto& i : metrics) { + f->open_object_section("metric"); + i.dump(f); + f->close_section(); + } + f->close_section(); + } + + static void generate_test_instances(std::list<MDSHealth*>& ls) { + ls.push_back(new MDSHealth); + ls.push_back(new MDSHealth); + ls.back()->metrics.push_back(MDSHealthMetric(MDS_HEALTH_TRIM, HEALTH_WARN, + "MDS is behind on trimming")); + } + bool operator==(MDSHealth const &other) const { return metrics == other.metrics; diff --git a/src/mgr/ActivePyModule.h b/src/mgr/ActivePyModule.h index 187fb68f846..8538f6e236a 100644 --- a/src/mgr/ActivePyModule.h +++ b/src/mgr/ActivePyModule.h @@ -27,6 +27,8 @@ #include "PyModuleRunner.h" #include "PyModule.h" +#include <fmt/core.h> + #include <vector> #include <string> @@ -46,7 +48,6 @@ private: std::string m_command_perms; const MgrSession* m_session = nullptr; - std::string fin_thread_name; public: Finisher finisher; // per active module finisher to execute commands @@ -54,8 +55,7 @@ public: ActivePyModule(const PyModuleRef &py_module_, LogChannelRef clog_) : PyModuleRunner(py_module_, clog_), - fin_thread_name(std::string("m-fin-" + py_module->get_name()).substr(0,15)), - finisher(g_ceph_context, thread_name, fin_thread_name) + finisher(g_ceph_context, thread_name, fmt::format("m-fin-{}", py_module->get_name()).substr(0,15)) { } @@ -97,14 +97,14 @@ public: uri = str; } - std::string get_uri() const + std::string_view get_uri() const { return uri; } - std::string get_fin_thread_name() const + std::string_view get_fin_thread_name() const { - return fin_thread_name; + return finisher.get_thread_name(); } bool is_authorized(const std::map<std::string, std::string>& arguments) const; diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index 17bb3951142..aebbb5d8c9a 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -770,9 +770,9 @@ std::map<std::string, std::string> ActivePyModules::get_services() const std::map<std::string, std::string> result; std::lock_guard l(lock); for (const auto& [name, module] : modules) { - std::string svc_str = module->get_uri(); + const std::string_view svc_str = module->get_uri(); if (!svc_str.empty()) { - result[name] = svc_str; + result.emplace(name, svc_str); } } diff --git a/src/mgr/DaemonHealthMetric.h b/src/mgr/DaemonHealthMetric.h index a6fbce29910..2bc382dde40 100644 --- 
a/src/mgr/DaemonHealthMetric.h +++ b/src/mgr/DaemonHealthMetric.h @@ -7,6 +7,7 @@ #include <ostream> #include "common/Formatter.h" #include "include/denc.h" +#include "common/Formatter.h" enum class daemon_metric : uint8_t { SLOW_OPS, diff --git a/src/mgr/MDSPerfMetricTypes.h b/src/mgr/MDSPerfMetricTypes.h index aa35b8cab0f..2323afcdd1b 100644 --- a/src/mgr/MDSPerfMetricTypes.h +++ b/src/mgr/MDSPerfMetricTypes.h @@ -10,6 +10,7 @@ #include "include/denc.h" #include "include/stringify.h" +#include "common/Formatter.h" #include "mds/mdstypes.h" #include "mgr/Types.h" @@ -40,7 +41,7 @@ struct MDSPerfMetricSubKeyDescriptor { MDSPerfMetricSubKeyDescriptor() { } MDSPerfMetricSubKeyDescriptor(MDSPerfMetricSubKeyType type, const std::string ®ex_str) - : type(type), regex_str(regex_str) { + : type(type), regex_str(regex_str) { } bool operator<(const MDSPerfMetricSubKeyDescriptor &other) const { @@ -59,6 +60,10 @@ struct MDSPerfMetricSubKeyDescriptor { denc(v.regex_str, p); DENC_FINISH(p); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("type", static_cast<uint8_t>(type)); + f->dump_string("regex_str", regex_str); + } }; WRITE_CLASS_DENC(MDSPerfMetricSubKeyDescriptor) @@ -77,7 +82,7 @@ struct denc_traits<MDSPerfMetricKeyDescriptor> { if (size) { size_t per = 0; denc(v.front(), per); - p += per * size; + p += per * size; } } static void encode(const MDSPerfMetricKeyDescriptor& v, @@ -183,6 +188,9 @@ struct MDSPerformanceCounterDescriptor { denc(v.type, p); DENC_FINISH(p); } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("type", static_cast<uint8_t>(type)); + } void pack_counter(const PerformanceCounter &c, ceph::buffer::list *bl) const; void unpack_counter(ceph::buffer::list::const_iterator& bl, PerformanceCounter *c) const; @@ -204,7 +212,7 @@ struct denc_traits<MDSPerformanceCounterDescriptors> { if (size) { size_t per = 0; denc(v.front(), per); - p += per * size; + p += per * size; } } static void encode(const MDSPerformanceCounterDescriptors& v, @@ -237,7 +245,7 @@ struct MDSPerfMetricLimit { MDSPerfMetricLimit() { } MDSPerfMetricLimit(const MDSPerformanceCounterDescriptor &order_by, uint64_t max_count) - : order_by(order_by), max_count(max_count) { + : order_by(order_by), max_count(max_count) { } bool operator<(const MDSPerfMetricLimit &other) const { @@ -254,6 +262,10 @@ struct MDSPerfMetricLimit { denc(v.max_count, p); DENC_FINISH(p); } + void dump(ceph::Formatter *f) const { + f->dump_object("order_by", order_by); + f->dump_unsigned("max_count", max_count); + } }; WRITE_CLASS_DENC(MDSPerfMetricLimit) @@ -268,7 +280,7 @@ struct MDSPerfMetricQuery { } MDSPerfMetricQuery(const MDSPerfMetricKeyDescriptor &key_descriptor, const MDSPerformanceCounterDescriptors &performance_counter_descriptors) - : key_descriptor(key_descriptor), + : key_descriptor(key_descriptor), performance_counter_descriptors(performance_counter_descriptors) { } @@ -320,6 +332,11 @@ struct MDSPerfMetricQuery { DENC_FINISH(p); } + void dump(ceph::Formatter *f) const { + f->dump_stream("key_descriptor") << key_descriptor; + f->dump_stream("performance_counter_descriptors") << performance_counter_descriptors; + } + void pack_counters(const PerformanceCounters &counters, ceph::buffer::list *bl) const; }; WRITE_CLASS_DENC(MDSPerfMetricQuery) @@ -332,7 +349,7 @@ struct MDSPerfCollector : PerfCollector { utime_t last_updated_mono; MDSPerfCollector(MetricQueryID query_id) - : PerfCollector(query_id) { + : PerfCollector(query_id) { } }; @@ -346,6 +363,15 @@ struct MDSPerfMetrics { 
denc(v.group_packed_performance_counters, p); DENC_FINISH(p); } + void dump(ceph::Formatter *f) const { + f->dump_stream("performance_counter_descriptors") << performance_counter_descriptors; + f->open_array_section("group_packed_performance_counters"); + for (auto &i : group_packed_performance_counters) { + f->dump_stream("key") << i.first; + f->dump_stream("value") << i.second; + } + f->close_section(); + } }; struct MDSPerfMetricReport { @@ -359,6 +385,24 @@ struct MDSPerfMetricReport { denc(v.rank_metrics_delayed, p); DENC_FINISH(p); } + void dump(ceph::Formatter *f) const { + f->open_array_section("reports"); + for (auto &i : reports) { + f->open_object_section("query"); + f->dump_object("query",i.first); + f->close_section(); + f->open_object_section("metrics"); + f->dump_object("metrics",i.second); + f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list<MDSPerfMetricReport *> &o) { + o.push_back(new MDSPerfMetricReport); + o.push_back(new MDSPerfMetricReport); + o.back()->reports.emplace(MDSPerfMetricQuery(), MDSPerfMetrics()); + o.back()->rank_metrics_delayed.insert(1); + } }; WRITE_CLASS_DENC(MDSPerfMetrics) diff --git a/src/mgr/MetricTypes.h b/src/mgr/MetricTypes.h index 762564f37ee..9bfd3fa0e25 100644 --- a/src/mgr/MetricTypes.h +++ b/src/mgr/MetricTypes.h @@ -32,11 +32,19 @@ struct OSDMetricPayload { } void dump(ceph::Formatter *f) const { - encode_json("report", report, f); + f->open_array_section("report"); + for (auto& i : report) { + f->open_object_section("query"); + i.first.dump(f); + f->close_section(); + f->open_object_section("report"); + i.second.dump(f); + f->close_section(); + } + f->close_section(); } - static void generate_test_instances(std::list<OSDMetricPayload*>& ls) { - ls.push_back(new OSDMetricPayload); + ls.push_back(new OSDMetricPayload()); } }; @@ -55,6 +63,12 @@ struct MDSMetricPayload { denc(v.metric_report, p); DENC_FINISH(p); } + void dump(ceph::Formatter *f) const { + metric_report.dump(f); + } + static void generate_test_instances(std::list<MDSMetricPayload*>& ls) { + ls.push_back(new MDSMetricPayload()); + } }; struct UnknownMetricPayload { @@ -65,6 +79,10 @@ struct UnknownMetricPayload { DENC(UnknownMetricPayload, v, p) { ceph_abort(); } + + void dump(ceph::Formatter *f) const { + ceph_abort(); + } }; WRITE_CLASS_DENC(OSDMetricPayload) @@ -145,6 +163,23 @@ struct MetricReportMessage { boost::apply_visitor(DecodeMetricPayloadVisitor(iter), payload); } + void dump(ceph::Formatter *f) const { + f->open_object_section("payload"); + if (const OSDMetricPayload* osdPayload = boost::get<OSDMetricPayload>(&payload)) { + osdPayload->dump(f); + } else if (const MDSMetricPayload* mdsPayload = boost::get<MDSMetricPayload>(&payload)) { + mdsPayload->dump(f); + } else if (const UnknownMetricPayload* unknownPayload = boost::get<UnknownMetricPayload>(&payload)) { + unknownPayload->dump(f); + } else { + ceph_abort(); + } + f->close_section(); + } + static void generate_test_instances(std::list<MetricReportMessage*>& ls) { + ls.push_back(new MetricReportMessage(OSDMetricPayload())); + ls.push_back(new MetricReportMessage(MDSMetricPayload())); + } }; WRITE_CLASS_ENCODER(MetricReportMessage); @@ -188,6 +223,22 @@ struct MDSConfigPayload { denc(v.config, p); DENC_FINISH(p); } + + void dump(ceph::Formatter *f) const { + f->open_object_section("config"); + for (auto& i : config) { + f->dump_object("query", i.first); + f->open_object_section("limits"); + for (auto& j : i.second) { + f->dump_object("limit", j); + } + 
f->close_section(); + } + f->close_section(); + } + static void generate_test_instances(std::list<MDSConfigPayload*>& ls) { + ls.push_back(new MDSConfigPayload); + } }; struct UnknownConfigPayload { diff --git a/src/mgr/OSDPerfMetricTypes.h b/src/mgr/OSDPerfMetricTypes.h index 8dd0afd8cb4..aba27f284a4 100644 --- a/src/mgr/OSDPerfMetricTypes.h +++ b/src/mgr/OSDPerfMetricTypes.h @@ -7,6 +7,8 @@ #include "common/ceph_json.h" #include "include/denc.h" #include "include/stringify.h" +#include "common/Formatter.h" + #include "mgr/Types.h" #include <regex> @@ -70,7 +72,6 @@ struct OSDPerfMetricSubKeyDescriptor { denc(v.regex_str, p); DENC_FINISH(p); } - void dump(ceph::Formatter *f) const { f->dump_unsigned("type", static_cast<uint8_t>(type)); f->dump_string("regex", regex_str); diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc index 0eb304e7353..08501568a2c 100644 --- a/src/mgr/PyModuleRegistry.cc +++ b/src/mgr/PyModuleRegistry.cc @@ -151,7 +151,8 @@ bool PyModuleRegistry::handle_mgr_map(const MgrMap &mgr_map_) return false; } else { bool modules_changed = mgr_map_.modules != mgr_map.modules || - mgr_map_.always_on_modules != mgr_map.always_on_modules; + mgr_map_.always_on_modules != mgr_map.always_on_modules || + mgr_map_.force_disabled_modules != mgr_map.force_disabled_modules; mgr_map = mgr_map_; if (standby_modules != nullptr) { @@ -240,10 +241,20 @@ void PyModuleRegistry::active_start( // Anything we're skipping because of !can_run will be flagged // to the user separately via get_health_checks if (!(i.second->is_enabled() && i.second->is_loaded())) { + dout(8) << __func__ << " Not starting module '" << i.first << "', it is " + << "not enabled and loaded" << dendl; continue; } - dout(4) << "Starting " << i.first << dendl; + // These are always-on modules but user force-disabled them. + if (mgr_map.force_disabled_modules.find(i.first) != + mgr_map.force_disabled_modules.end()) { + dout(8) << __func__ << " Not starting module '" << i.first << "', it is " + << "force-disabled" << dendl; + continue; + } + + dout(4) << "Starting module '" << i.first << "'" << dendl; active_modules->start_one(i.second); } } diff --git a/src/mon/ConfigMap.cc b/src/mon/ConfigMap.cc index 86528c1dedf..1444103f460 100644 --- a/src/mon/ConfigMap.cc +++ b/src/mon/ConfigMap.cc @@ -266,7 +266,7 @@ int ConfigMap::add_option( ldout(cct, 10) << __func__ << " unrecognized option '" << name << "'" << dendl; stray_options.push_back( std::unique_ptr<Option>( - new Option(name, Option::TYPE_STR, Option::LEVEL_UNKNOWN))); + new Option(std::string{name}, Option::TYPE_STR, Option::LEVEL_UNKNOWN))); opt = stray_options.back().get(); } diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 62d37574ded..6220a357ff0 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -385,6 +385,17 @@ public: return -EINVAL; } + bool confirm = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", confirm); + if (var == "max_mds" && !confirm && mon->mdsmon()->has_any_health_warning()) { + ss << "One or more file system health warnings are present. Modifying " + << "the file system setting variable \"max_mds\" may not help " + << "troubleshoot or recover from these warnings and may further " + << "destabilize the system. 
If you really wish to proceed, run " + << "again with --yes-i-really-mean-it"; + return -EPERM; + } + return set_val(mon, fsmap, op, cmdmap, ss, fsp->get_fscid(), var, val); } }; diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 76a57ac443d..d8cca4ceb61 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -1557,6 +1557,13 @@ bool MDSMonitor::has_health_warnings(vector<mds_metric_t> warnings) return false; } +bool MDSMonitor::has_any_health_warning() +{ + return std::any_of( + pending_daemon_health.begin(), pending_daemon_health.end(), + [](auto& it) { return !it.second.metrics.empty() ? true : false; }); +} + int MDSMonitor::filesystem_command( FSMap &fsmap, MonOpRequestRef op, diff --git a/src/mon/MDSMonitor.h b/src/mon/MDSMonitor.h index b0f88cd3130..dd2a269009d 100644 --- a/src/mon/MDSMonitor.h +++ b/src/mon/MDSMonitor.h @@ -53,6 +53,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand bool prepare_update(MonOpRequestRef op) override; bool should_propose(double& delay) override; bool has_health_warnings(std::vector<mds_metric_t> warnings); + bool has_any_health_warning(); bool should_print_status() const { auto& fs = get_fsmap(); diff --git a/src/mon/MgrMap.h b/src/mon/MgrMap.h index 82f6ea88046..1ab542a871f 100644 --- a/src/mon/MgrMap.h +++ b/src/mon/MgrMap.h @@ -297,6 +297,9 @@ public: // active version. std::map<uint32_t, std::set<std::string>> always_on_modules; + // Modules which are always-on but have been force-disabled by user. + std::set<std::string> force_disabled_modules; + // Modules which are reported to exist std::vector<ModuleInfo> available_modules; @@ -448,7 +451,7 @@ public: ENCODE_FINISH(bl); return; } - ENCODE_START(13, 6, bl); + ENCODE_START(14, 6, bl); encode(epoch, bl); encode(active_addrs, bl, features); encode(active_gid, bl); @@ -473,13 +476,14 @@ public: encode(clients_addrs, bl, features); encode(clients_names, bl, features); encode(flags, bl); + encode(force_disabled_modules, bl); ENCODE_FINISH(bl); return; } void decode(ceph::buffer::list::const_iterator& p) { - DECODE_START(13, p); + DECODE_START(14, p); decode(epoch, p); decode(active_addrs, p); decode(active_gid, p); @@ -549,6 +553,11 @@ public: if (struct_v >= 13) { decode(flags, p); } + + if (struct_v >= 14) { + decode(force_disabled_modules, p); + } + DECODE_FINISH(p); } @@ -603,6 +612,13 @@ public: f->close_section(); } f->close_section(); // always_on_modules + + f->open_object_section("force_disabled_modules"); + for (auto& m : force_disabled_modules) { + f->dump_string("module", m); + } + f->close_section(); + f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch); f->open_array_section("active_clients"); for (const auto& i : clients) { diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index c235d9e6219..b89878dddb7 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -146,10 +146,12 @@ void MgrMonitor::create_initial() } pending_map.always_on_modules = always_on_modules(); pending_command_descs = mgr_commands; - dout(10) << __func__ << " initial modules " << pending_map.modules - << ", always on modules " << pending_map.get_always_on_modules() - << ", " << pending_command_descs.size() << " commands" + dout(10) << __func__ << " initial enabled modules: " << pending_map.modules << dendl; + dout(10) << __func__ << "always on modules: " << + pending_map.get_always_on_modules() << dendl; + dout(10) << __func__ << "total " << pending_command_descs.size() << + " commands" << dendl; } void 
MgrMonitor::get_store_prefixes(std::set<string>& s) const @@ -215,7 +217,7 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap) string name = string("mgr/") + i.name + "/" + j.second.name; auto p = mgr_module_options.emplace( name, - Option(name, static_cast<Option::type_t>(j.second.type), + Option(std::string{name}, static_cast<Option::type_t>(j.second.type), static_cast<Option::level_t>(j.second.level))); Option& opt = p.first->second; opt.set_flags(static_cast<Option::flag_t>(j.second.flags)); @@ -1019,6 +1021,13 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op) f->dump_string("module", p); } f->close_section(); + + f->open_array_section("force_disabled_modules"); + for (auto& p : map.force_disabled_modules) { + f->dump_string("module", p); + } + f->close_section(); + f->open_array_section("enabled_modules"); for (auto& p : map.modules) { if (map.get_always_on_modules().count(p) > 0) @@ -1048,7 +1057,11 @@ bool MgrMonitor::preprocess_command(MonOpRequestRef op) for (auto& p : map.get_always_on_modules()) { tbl << p; - tbl << "on (always on)"; + if (map.force_disabled_modules.find(p) == map.force_disabled_modules.end()) { + tbl << "on (always on)"; + } else { + tbl << "off (always on but force-disabled)"; + } tbl << TextTable::endrow; } for (auto& p : map.modules) { @@ -1269,10 +1282,13 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) r = -EINVAL; goto out; } - if (pending_map.get_always_on_modules().count(module) > 0) { + + if (pending_map.get_always_on_modules().count(module) > 0 && + !pending_map.force_disabled_modules.contains(module)) { ss << "module '" << module << "' is already enabled (always-on)"; goto out; } + bool force = false; cmd_getval_compat_cephbool(cmdmap, "force", force); if (!pending_map.all_support_module(module) && @@ -1296,7 +1312,12 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) ss << "module '" << module << "' is already enabled"; r = 0; goto out; + } else if (pending_map.force_disabled_modules.contains(module)) { + pending_map.force_disabled_modules.erase(module); + r = 0; + goto out; } + pending_map.modules.insert(module); } else if (prefix == "mgr module disable") { string module; @@ -1306,8 +1327,9 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) goto out; } if (pending_map.get_always_on_modules().count(module) > 0) { - ss << "module '" << module << "' cannot be disabled (always-on)"; - r = -EINVAL; + ss << "module '" << module << "' cannot be disabled (always-on), use " << + "'ceph mgr module force disable' command to disable an always-on module"; + r = -EPERM; goto out; } if (!pending_map.module_enabled(module)) { @@ -1318,7 +1340,52 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) if (!pending_map.modules.count(module)) { ss << "module '" << module << "' is not enabled"; } + dout(8) << __func__ << " disabling module " << module << " from new " << dendl; pending_map.modules.erase(module); + } else if (prefix == "mgr module force disable") { + string mod; + cmd_getval(cmdmap, "module", mod); + + bool confirmation_flag = false; + cmd_getval(cmdmap, "yes_i_really_mean_it", confirmation_flag); + + if (mod.empty()) { + ss << "Module name wasn't passed!"; + r = -EINVAL; + goto out; + } + + if (!pending_map.get_always_on_modules().contains(mod)) { + ss << "Always-on module named \"" << mod << "\" does not exist"; + r = -EINVAL; + goto out; + } else if (pending_map.modules.contains(mod)) { + ss << "Module '" << mod << "' is not an always-on module, only always-on " << + "modules can be disabled through this command."; + 
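+      // Assumed CLI usage (illustrative sketch only; the command itself is
+      // defined by the MonCommands.h entry later in this patch):
+      //   ceph mgr module force disable <module> --yes-i-really-mean-it   # always-on modules
+      //   ceph mgr module disable <module>                                # regular modules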
r = -EINVAL; + goto out; + } + + if (pending_map.force_disabled_modules.contains(mod)) { + ss << "Module \"" << mod << "\" is already disabled"; + r = 0; + goto out; + } + + if (!confirmation_flag) { + ss << "This command will disable operations and remove commands that " + << "other Ceph utilities expect to be available. Do not continue " + << "unless your cluster is already experiencing an event due to " + << "which it is advised to disable this module as part of " + << "troubleshooting. If you are sure that you wish to continue, " + << "run again with --yes-i-really-mean-it"; + r = -EPERM; + goto out; + } + + dout(8) << __func__ << " force-disabling module '" << mod << "'" << dendl; + pending_map.force_disabled_modules.insert(mod); + pending_map.modules.erase(mod); } else { ss << "Command '" << prefix << "' not implemented!"; r = -ENOSYS; diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index b5de8837cb7..3cc3c8abd1e 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -1357,6 +1357,10 @@ COMMAND("mgr module enable " COMMAND("mgr module disable " "name=module,type=CephString", "disable mgr module", "mgr", "rw") +COMMAND("mgr module force disable " + "name=module,type=CephString " + "name=yes_i_really_mean_it,type=CephBool,req=false", + "force disable an always-on mgr module", "mgr", "rw") COMMAND("mgr metadata name=who,type=CephString,req=false", "dump metadata for all daemons or a specific daemon", "mgr", "r") diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index d60d3edefd2..c01ea9e7103 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -99,6 +99,8 @@ int NVMeofGwMap::cfg_add_gw( return 0; } } + } + for (auto& itr: created_gws[group_key]) { if (itr.second.availability == gw_availability_t::GW_DELETING) { //Was found some GW in "Deleting" state. Just to inherit its ANA group NvmeGwMonState & gw_created = created_gws[group_key][itr.first]; @@ -166,6 +168,7 @@ int NVMeofGwMap::cfg_delete_gw( dout(4) << " Deleting GW :"<< gw_id << " in state " << state.availability << " Resulting GW availability: " << state.availability << dendl; + state.subsystems.clear(); // ignore subsystems of this GW return 0; } } @@ -217,10 +220,13 @@ int NVMeofGwMap::do_delete_gw( int NVMeofGwMap::get_num_namespaces(const NvmeGwId &gw_id, const NvmeGroupKey& group_key, const BeaconSubsystems& subs) { - auto grpid = created_gws[group_key][gw_id].ana_grp_id ; + auto grpid = created_gws[group_key][gw_id].ana_grp_id; int num_ns = 0; - for (auto & subs_it:subs) { - for (auto & ns :subs_it.namespaces) { + if (subs.size() == 0) { + dout(20) << "Empty subsystems for GW " << gw_id << dendl; + } + for (auto & subsystem:subs) { + for (auto & ns :subsystem.namespaces) { if (ns.anagrpid == (grpid+1)) { num_ns++; } @@ -241,13 +247,14 @@ void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key, do_delete_gw(gw_id, group_key); propose_pending = true; } - dout(4) << " to delete ? " << gw_id << " num_ns " << num_ns << dendl; + dout(4) << " to delete ? " << gw_id << " num_ns " << num_ns + << " subsystems size "<< subs.size() << dendl; break; // handle just one GW in "Deleting" state in time. 
} } } -int NVMeofGwMap::process_gw_map_gw_no_subsystems( +int NVMeofGwMap::process_gw_map_gw_no_subsys_no_listeners( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending) { int rc = 0; @@ -417,7 +424,6 @@ void NVMeofGwMap::find_failback_gw( auto& gws_states = created_gws[group_key]; auto& gw_state = created_gws[group_key][gw_id]; bool do_failback = false; - dout(10) << "Find failback GW for GW " << gw_id << dendl; for (auto& gw_state_it: gws_states) { auto& st = gw_state_it.second; diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index 29710371742..267d85b10f9 100755 --- a/src/mon/NVMeofGwMap.h +++ b/src/mon/NVMeofGwMap.h @@ -54,7 +54,7 @@ public: int process_gw_map_gw_down( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending); - int process_gw_map_gw_no_subsystems( + int process_gw_map_gw_no_subsys_no_listeners( const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending); void update_active_timers(bool &propose_pending); diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc index 544ad674722..d9e936e27df 100644 --- a/src/mon/NVMeofGwMon.cc +++ b/src/mon/NVMeofGwMon.cc @@ -115,18 +115,21 @@ void NVMeofGwMon::tick() _propose_pending |= propose; last_beacon.erase(lb); } else { - BeaconSubsystems *subsystems = - &pending_map.created_gws[lb.group_key][lb.gw_id].subsystems; - if (subsystems && subsystems->size() && old_group_key != lb.group_key) { - // to call track_deleting_gws once per each group-key - pending_map.track_deleting_gws(lb.group_key, *subsystems, propose); - old_group_key = lb.group_key; - _propose_pending |= propose; - } dout(20) << "beacon live for GW key: " << lb.gw_id << dendl; } } - + BeaconSubsystems empty_subsystems; + for (auto &[group_key, gws_states]: pending_map.created_gws) { + BeaconSubsystems *subsystems = &empty_subsystems; + for (auto& gw_state : gws_states) { // loop for GWs inside nqn group + subsystems = &gw_state.second.subsystems; + if (subsystems->size()) { // Set subsystems to the valid value + break; + } + } + pending_map.track_deleting_gws(group_key, *subsystems, propose); + _propose_pending |= propose; + } // Periodic: take care of not handled ANA groups pending_map.handle_abandoned_ana_groups(propose); _propose_pending |= propose; @@ -364,6 +367,13 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op) std::stringstream sstrm1; sstrm1 << state.availability; f->dump_string("Availability", sstrm1.str()); + uint32_t num_listeners = 0; + if (state.availability == gw_availability_t::GW_AVAILABLE) { + for (auto &subs: state.subsystems) { + num_listeners += subs.listeners.size(); + } + f->dump_unsigned("num-listeners", num_listeners); + } sstrm1.str(""); for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) { sstrm1 << " " << state_itr.first + 1 << ": " @@ -473,7 +483,7 @@ void NVMeofGwMon::process_gw_down(const NvmeGwId &gw_id, if (avail == gw_availability_t::GW_UNAVAILABLE) { pending_map.process_gw_map_gw_down(gw_id, group_key, propose_pending); } else { - pending_map.process_gw_map_gw_no_subsystems(gw_id, group_key, propose_pending); + pending_map.process_gw_map_gw_no_subsys_no_listeners(gw_id, group_key, propose_pending); } } @@ -567,10 +577,18 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) // At this stage the gw has to be in the Created_gws if (gw == group_gws.end()) { - dout(4) << "Administratively deleted GW sends beacon " << gw_id << dendl; + dout(4) << "GW that does not appear in the map sends beacon, ignore " + << gw_id << dendl; + 
mon.no_reply(op); + goto false_return; // not sending ack to this beacon + } + if (pending_map.created_gws[group_key][gw_id].availability == + gw_availability_t::GW_DELETING) { + dout(4) << "GW sends beacon in DELETING state, ignore " + << gw_id << dendl; + mon.no_reply(op); goto false_return; // not sending ack to this beacon } - // deep copy the whole nonce map of this GW if (m->get_nonce_map().size()) { if (pending_map.created_gws[group_key][gw_id].nonce_map != @@ -589,7 +607,18 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op) if (sub.size() == 0) { avail = gw_availability_t::GW_CREATED; - } + } else { + bool listener_found = false; + for (auto &subs: sub) { + if (subs.listeners.size()) { + listener_found = true; + break; + } + } + if (!listener_found) { + avail = gw_availability_t::GW_CREATED; + } + }// for HA no-subsystems and no-listeners are same usecases if (pending_map.created_gws[group_key][gw_id].subsystems != sub) { dout(10) << "subsystems of GW changed, propose pending " << gw_id << dendl; pending_map.created_gws[group_key][gw_id].subsystems = sub; diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 67d169570ff..3b5ded25aef 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -3352,9 +3352,13 @@ void PGMap::get_health_checks( // application metadata is not encoded until luminous is minimum // required release if (pool.application_metadata.empty() && !pool.is_tier()) { - stringstream ss; - ss << "application not enabled on pool '" << pool_name << "'"; - detail.push_back(ss.str()); + utime_t now(ceph::real_clock::now()); + if ((now - pool.get_create_time()) > + g_conf().get_val<std::chrono::seconds>("mon_warn_on_pool_no_app_grace").count()) { + stringstream ss; + ss << "application not enabled on pool '" << pool_name << "'"; + detail.push_back(ss.str()); + } } } if (!detail.empty()) { diff --git a/src/mrgw.sh b/src/mrgw.sh index 05739bf015e..86bef336867 100755 --- a/src/mrgw.sh +++ b/src/mrgw.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Start/restart a radosgw instance on the given mstart.sh cluster. + set -e rgw_frontend=${RGW_FRONTEND:-"beast"} @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Run a ceph command against the given mstart.sh cluster. 
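+#
+# Example (illustrative; assumes a cluster named "c1" started via mstart.sh):
+#   ../src/mrun c1 ceph -s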
+ [ $# -lt 2 ] && echo "usage: $0 <name> <command> [params...]" && exit 1 root=`dirname $0` diff --git a/src/msg/Message.h b/src/msg/Message.h index 3e5c58ec376..bb67ff3eef5 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -262,8 +262,8 @@ public: #endif protected: - ceph_msg_header header; // headerelope - ceph_msg_footer footer; + ceph_msg_header header{}; // headerelope + ceph_msg_footer footer{}; ceph::buffer::list payload; // "front" unaligned blob ceph::buffer::list middle; // "middle" unaligned blob ceph::buffer::list data; // data payload (page-alignment will be preserved where possible) @@ -332,16 +332,11 @@ protected: friend class Messenger; public: - Message() { - memset(&header, 0, sizeof(header)); - memset(&footer, 0, sizeof(footer)); - } + Message() = default; Message(int t, int version=1, int compat_version=0) { - memset(&header, 0, sizeof(header)); header.type = t; header.version = version; header.compat_version = compat_version; - memset(&footer, 0, sizeof(footer)); } Message *get() { diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc index 683be086efa..ab3d454748e 100644 --- a/src/msg/async/AsyncConnection.cc +++ b/src/msg/async/AsyncConnection.cc @@ -310,7 +310,7 @@ ssize_t AsyncConnection::write(ceph::buffer::list &bl, outgoing_bl.claim_append(bl); ssize_t r = _try_send(more); if (r > 0) { - writeCallback = callback; + writeCallback = std::move(callback); } return r; } @@ -621,7 +621,7 @@ void AsyncConnection::fault() } void AsyncConnection::_stop() { - writeCallback.reset(); + writeCallback = {}; dispatch_queue->discard_queue(conn_id); async_msgr->unregister_conn(this); worker->release_worker(); @@ -737,8 +737,7 @@ void AsyncConnection::handle_write_callback() { recv_start_time = ceph::mono_clock::now(); write_lock.lock(); if (writeCallback) { - auto callback = *writeCallback; - writeCallback.reset(); + auto callback = std::move(writeCallback); write_lock.unlock(); callback(0); return; diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h index 78a590f8ca3..a4f18e2c4fb 100644 --- a/src/msg/async/AsyncConnection.h +++ b/src/msg/async/AsyncConnection.h @@ -223,7 +223,7 @@ private: std::unique_ptr<Protocol> protocol; - std::optional<std::function<void(ssize_t)>> writeCallback; + std::function<void(ssize_t)> writeCallback; std::function<void(char *, ssize_t)> readCallback; std::optional<unsigned> pendingReadLen; char *read_buffer; diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h index a595667e447..6acd6275738 100644 --- a/src/msg/async/Event.h +++ b/src/msg/async/Event.h @@ -97,11 +97,7 @@ class EventCenter { using clock_type = ceph::coarse_mono_clock; struct AssociatedCenters { - EventCenter *centers[MAX_EVENTCENTER]; - AssociatedCenters() { - // FIPS zeroization audit 20191115: this memset is not security related. - memset(centers, 0, MAX_EVENTCENTER * sizeof(EventCenter*)); - } + EventCenter *centers[MAX_EVENTCENTER]{}; }; struct FileEvent { diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc index b14de7b1e56..a53f6389c31 100644 --- a/src/msg/async/ProtocolV1.cc +++ b/src/msg/async/ProtocolV1.cc @@ -90,9 +90,8 @@ void ProtocolV1::connect() { // reset connect state variables authorizer_buf.clear(); - // FIPS zeroization audit 20191115: these memsets are not security related. 
- memset(&connect_msg, 0, sizeof(connect_msg)); - memset(&connect_reply, 0, sizeof(connect_reply)); + connect_msg = {}; + connect_reply = {}; global_seq = messenger->get_global_seq(); } @@ -820,7 +819,7 @@ CtPtr ProtocolV1::read_message_data_prepare() { #if 0 // rx_buffers is broken by design... see // http://tracker.ceph.com/issues/22480 - map<ceph_tid_t, pair<ceph::buffer::list, int> >::iterator p = + const auto p = connection->rx_buffers.find(current_header.tid); if (p != connection->rx_buffers.end()) { ldout(cct, 10) << __func__ << " seleting rx buffer v " << p->second.second @@ -1205,7 +1204,7 @@ void ProtocolV1::requeue_sent() { return; } - list<out_q_entry_t> &rq = out_q[CEPH_MSG_PRIO_HIGHEST]; + auto &rq = out_q[CEPH_MSG_PRIO_HIGHEST]; out_seq -= sent.size(); while (!sent.empty()) { Message *m = sent.back(); @@ -1220,10 +1219,11 @@ void ProtocolV1::requeue_sent() { uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) { ldout(cct, 10) << __func__ << " " << seq << dendl; std::lock_guard<std::mutex> l(connection->write_lock); - if (out_q.count(CEPH_MSG_PRIO_HIGHEST) == 0) { + const auto it = out_q.find(CEPH_MSG_PRIO_HIGHEST); + if (it == out_q.end()) { return seq; } - list<out_q_entry_t> &rq = out_q[CEPH_MSG_PRIO_HIGHEST]; + auto &rq = it->second; uint64_t count = out_seq; while (!rq.empty()) { Message* const m = rq.front().m; @@ -1235,7 +1235,7 @@ uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) { rq.pop_front(); count++; } - if (rq.empty()) out_q.erase(CEPH_MSG_PRIO_HIGHEST); + if (rq.empty()) out_q.erase(it); return count; } @@ -1246,18 +1246,16 @@ uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) { void ProtocolV1::discard_out_queue() { ldout(cct, 10) << __func__ << " started" << dendl; - for (list<Message *>::iterator p = sent.begin(); p != sent.end(); ++p) { - ldout(cct, 20) << __func__ << " discard " << *p << dendl; - (*p)->put(); + for (Message *msg : sent) { + ldout(cct, 20) << __func__ << " discard " << msg << dendl; + msg->put(); } sent.clear(); - for (map<int, list<out_q_entry_t>>::iterator p = - out_q.begin(); - p != out_q.end(); ++p) { - for (list<out_q_entry_t>::iterator r = p->second.begin(); - r != p->second.end(); ++r) { - ldout(cct, 20) << __func__ << " discard " << r->m << dendl; - r->m->put(); + for (auto& [ prio, entries ] : out_q) { + static_cast<void>(prio); + for (auto& entry : entries) { + ldout(cct, 20) << __func__ << " discard " << entry.m << dendl; + entry.m->put(); } } out_q.clear(); @@ -1296,7 +1294,7 @@ void ProtocolV1::reset_recv_state() // clean read and write callbacks connection->pendingReadLen.reset(); - connection->writeCallback.reset(); + connection->writeCallback = {}; if (state > THROTTLE_MESSAGE && state <= READ_FOOTER_AND_DISPATCH && connection->policy.throttler_messages) { @@ -1328,14 +1326,12 @@ void ProtocolV1::reset_recv_state() ProtocolV1::out_q_entry_t ProtocolV1::_get_next_outgoing() { out_q_entry_t out_entry; - if (!out_q.empty()) { - map<int, list<out_q_entry_t>>::reverse_iterator it = - out_q.rbegin(); + if (const auto it = out_q.begin(); it != out_q.end()) { ceph_assert(!it->second.empty()); - list<out_q_entry_t>::iterator p = it->second.begin(); + const auto p = it->second.begin(); out_entry = *p; it->second.erase(p); - if (it->second.empty()) out_q.erase(it->first); + if (it->second.empty()) out_q.erase(it); } return out_entry; } @@ -1572,8 +1568,7 @@ CtPtr ProtocolV1::handle_connect_message_write(int r) { CtPtr ProtocolV1::wait_connect_reply() { 
ldout(cct, 20) << __func__ << dendl; - // FIPS zeroization audit 20191115: this memset is not security related. - memset(&connect_reply, 0, sizeof(connect_reply)); + connect_reply = {}; return READ(sizeof(connect_reply), handle_connect_reply_1); } @@ -1923,8 +1918,7 @@ CtPtr ProtocolV1::handle_client_banner(char *buffer, int r) { CtPtr ProtocolV1::wait_connect_message() { ldout(cct, 20) << __func__ << dendl; - // FIPS zeroization audit 20191115: this memset is not security related. - memset(&connect_msg, 0, sizeof(connect_msg)); + connect_msg = {}; return READ(sizeof(connect_msg), handle_connect_message_1); } @@ -1988,8 +1982,7 @@ CtPtr ProtocolV1::handle_connect_message_2() { ceph_msg_connect_reply reply; ceph::buffer::list authorizer_reply; - // FIPS zeroization audit 20191115: this memset is not security related. - memset(&reply, 0, sizeof(reply)); + reply = {}; reply.protocol_version = messenger->get_proto_version(connection->peer_type, false); @@ -2616,8 +2609,7 @@ CtPtr ProtocolV1::server_ready() { << dendl; ldout(cct, 20) << __func__ << " accept done" << dendl; - // FIPS zeroization audit 20191115: this memset is not security related. - memset(&connect_msg, 0, sizeof(connect_msg)); + connect_msg = {}; if (connection->delay_state) { ceph_assert(connection->delay_state->ready()); diff --git a/src/msg/async/ProtocolV1.h b/src/msg/async/ProtocolV1.h index 1b7c1d2b5f8..63bc1cd0946 100644 --- a/src/msg/async/ProtocolV1.h +++ b/src/msg/async/ProtocolV1.h @@ -112,7 +112,12 @@ protected: bool is_prepared {false}; }; // priority queue for outbound msgs - std::map<int, std::list<out_q_entry_t>> out_q; + + /** + * A queue for each priority value, highest priority first. + */ + std::map<int, std::list<out_q_entry_t>, std::greater<int>> out_q; + bool keepalive; bool write_in_progress = false; diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc index 6d44d6c783f..ed6f93cdd48 100644 --- a/src/msg/async/ProtocolV2.cc +++ b/src/msg/async/ProtocolV2.cc @@ -127,9 +127,9 @@ bool ProtocolV2::is_connected() { return can_write; } void ProtocolV2::discard_out_queue() { ldout(cct, 10) << __func__ << " started" << dendl; - for (auto p = sent.begin(); p != sent.end(); ++p) { - ldout(cct, 20) << __func__ << " discard " << *p << dendl; - (*p)->put(); + for (Message *msg : sent) { + ldout(cct, 20) << __func__ << " discard " << msg << dendl; + msg->put(); } sent.clear(); for (auto& [ prio, entries ] : out_queue) { @@ -211,10 +211,11 @@ void ProtocolV2::requeue_sent() { uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) { ldout(cct, 10) << __func__ << " " << seq << dendl; std::lock_guard<std::mutex> l(connection->write_lock); - if (out_queue.count(CEPH_MSG_PRIO_HIGHEST) == 0) { + const auto it = out_queue.find(CEPH_MSG_PRIO_HIGHEST); + if (it == out_queue.end()) { return seq; } - auto& rq = out_queue[CEPH_MSG_PRIO_HIGHEST]; + auto& rq = it->second; uint64_t count = out_seq; while (!rq.empty()) { Message* const m = rq.front().m; @@ -226,7 +227,7 @@ uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) { rq.pop_front(); count++; } - if (rq.empty()) out_queue.erase(CEPH_MSG_PRIO_HIGHEST); + if (rq.empty()) out_queue.erase(it); return count; } @@ -265,7 +266,7 @@ void ProtocolV2::reset_recv_state() { // clean read and write callbacks connection->pendingReadLen.reset(); - connection->writeCallback.reset(); + connection->writeCallback = {}; next_tag = static_cast<Tag>(0); @@ -507,14 +508,13 @@ void ProtocolV2::read_event() { ProtocolV2::out_queue_entry_t 
ProtocolV2::_get_next_outgoing() { out_queue_entry_t out_entry; - if (!out_queue.empty()) { - auto it = out_queue.rbegin(); + if (const auto it = out_queue.begin(); it != out_queue.end()) { auto& entries = it->second; ceph_assert(!entries.empty()); out_entry = entries.front(); entries.pop_front(); if (entries.empty()) { - out_queue.erase(it->first); + out_queue.erase(it); } } return out_entry; @@ -796,7 +796,7 @@ CtPtr ProtocolV2::read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next, } template <class F> -CtPtr ProtocolV2::write(const std::string &desc, +CtPtr ProtocolV2::write(std::string_view desc, CONTINUATION_TYPE<ProtocolV2> &next, F &frame) { ceph::bufferlist bl; @@ -812,7 +812,7 @@ CtPtr ProtocolV2::write(const std::string &desc, return write(desc, next, bl); } -CtPtr ProtocolV2::write(const std::string &desc, +CtPtr ProtocolV2::write(std::string_view desc, CONTINUATION_TYPE<ProtocolV2> &next, ceph::bufferlist &buffer) { if (unlikely(pre_auth.enabled)) { diff --git a/src/msg/async/ProtocolV2.h b/src/msg/async/ProtocolV2.h index 6441866fea4..1ee258c4975 100644 --- a/src/msg/async/ProtocolV2.h +++ b/src/msg/async/ProtocolV2.h @@ -93,7 +93,12 @@ private: bool is_prepared {false}; Message* m {nullptr}; }; - std::map<int, std::list<out_queue_entry_t>> out_queue; + + /** + * A queue for each priority value, highest priority first. + */ + std::map<int, std::list<out_queue_entry_t>, std::greater<int>> out_queue; + std::list<Message *> sent; std::atomic<uint64_t> out_seq{0}; std::atomic<uint64_t> in_seq{0}; @@ -130,10 +135,10 @@ private: Ct<ProtocolV2> *read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next, rx_buffer_t&& buffer); template <class F> - Ct<ProtocolV2> *write(const std::string &desc, + Ct<ProtocolV2> *write(std::string_view desc, CONTINUATION_TYPE<ProtocolV2> &next, F &frame); - Ct<ProtocolV2> *write(const std::string &desc, + Ct<ProtocolV2> *write(std::string_view desc, CONTINUATION_TYPE<ProtocolV2> &next, ceph::bufferlist &buffer); diff --git a/src/msg/async/frames_v2.cc b/src/msg/async/frames_v2.cc index ef4a6ddabfb..a9b03c74d4d 100644 --- a/src/msg/async/frames_v2.cc +++ b/src/msg/async/frames_v2.cc @@ -63,9 +63,7 @@ static bool check_epilogue_late_status(__u8 late_status) { void FrameAssembler::fill_preamble(Tag tag, preamble_block_t& preamble) const { - // FIPS zeroization audit 20191115: this memset is not security related. - ::memset(&preamble, 0, sizeof(preamble)); - + preamble = {}; preamble.tag = static_cast<__u8>(tag); for (size_t i = 0; i < m_descs.size(); i++) { preamble.segments[i].length = m_descs[i].logical_len; @@ -100,9 +98,7 @@ uint64_t FrameAssembler::get_frame_onwire_len() const { bufferlist FrameAssembler::asm_crc_rev0(const preamble_block_t& preamble, bufferlist segment_bls[]) const { - epilogue_crc_rev0_block_t epilogue; - // FIPS zeroization audit 20191115: this memset is not security related. - ::memset(&epilogue, 0, sizeof(epilogue)); + epilogue_crc_rev0_block_t epilogue{}; bufferlist frame_bl(sizeof(preamble) + sizeof(epilogue)); frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble)); @@ -123,9 +119,7 @@ bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble, preamble_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble)); - epilogue_secure_rev0_block_t epilogue; - // FIPS zeroization audit 20191115: this memset is not security related. 
- ::memset(&epilogue, 0, sizeof(epilogue)); + epilogue_secure_rev0_block_t epilogue{}; bufferlist epilogue_bl(sizeof(epilogue)); epilogue_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue)); @@ -151,9 +145,7 @@ bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble, bufferlist FrameAssembler::asm_crc_rev1(const preamble_block_t& preamble, bufferlist segment_bls[]) const { - epilogue_crc_rev1_block_t epilogue; - // FIPS zeroization audit 20191115: this memset is not security related. - ::memset(&epilogue, 0, sizeof(epilogue)); + epilogue_crc_rev1_block_t epilogue{}; epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE; bufferlist frame_bl(sizeof(preamble) + FRAME_CRC_SIZE + sizeof(epilogue)); @@ -215,9 +207,7 @@ bufferlist FrameAssembler::asm_secure_rev1(const preamble_block_t& preamble, return frame_bl; // no epilogue if only one segment } - epilogue_secure_rev1_block_t epilogue; - // FIPS zeroization audit 20191115: this memset is not security related. - ::memset(&epilogue, 0, sizeof(epilogue)); + epilogue_secure_rev1_block_t epilogue{}; epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE; bufferlist epilogue_bl(sizeof(epilogue)); epilogue_bl.append(reinterpret_cast<const char*>(&epilogue), diff --git a/src/mstart.sh b/src/mstart.sh index 34b57e17611..0c512ca9eb8 100755 --- a/src/mstart.sh +++ b/src/mstart.sh @@ -1,5 +1,33 @@ #!/bin/sh +# Deploy a vstart.sh cluster in a named subdirectory. This makes it possible to +# start multiple clusters in different subdirectories. See mstop.sh for cleanup. +# +# Example: +# +# ~/ceph/build $ MON=1 OSD=1 RGW=1 MDS=0 MGR=0 ../src/mstart.sh c1 -n -d +# ~/ceph/build $ MON=1 OSD=1 RGW=1 MDS=0 MGR=0 ../src/mstart.sh c2 -n -d +# +# ~/ceph/build $ ls run +# c1 c2 +# ~/ceph/build $ ls run/c1 +# asok ceph.conf dev keyring out +# +# ~/ceph/build $ ../src/mrun c1 radosgw-admin user list +# [ +# "56789abcdef0123456789abcdef0123456789abcdef0123456789abcdef01234", +# "testx$9876543210abcdef0123456789abcdef0123456789abcdef0123456789abcdef", +# "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", +# "testacct1user", +# "test", +# "testacct2root", +# "testacct1root", +# "testid" +# ] +# +# ~/ceph/build $ ../src/mstop.sh c1 +# ~/ceph/build $ ../src/mstop.sh c2 + usage="usage: $0 <name> [vstart options]..\n" usage_exit() { diff --git a/src/mstop.sh b/src/mstop.sh index 702d1765941..eec0ca02e42 100755 --- a/src/mstop.sh +++ b/src/mstop.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# Stop a named cluster started by mstart.sh + set -e script_root=`dirname $0` diff --git a/src/mypy-constrains.txt b/src/mypy-constrains.txt index 7810870804e..0a79b8ef4f1 100644 --- a/src/mypy-constrains.txt +++ b/src/mypy-constrains.txt @@ -2,7 +2,7 @@ # Unfortunately this means we have to manually update those # packages regularly. 
-mypy==1.1.1 +mypy==1.9 # global types-python-dateutil==0.1.3 diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 3dcd96830c4..5f4f1a4d48a 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -3760,15 +3760,16 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ { auto t0 = mono_clock::now(); std::lock_guard hl(h->lock); + auto& fnode = h->file->fnode; dout(10) << __func__ << " 0x" << std::hex << offset << std::dec - << " file " << h->file->fnode << dendl; + << " file " << fnode << dendl; if (h->file->deleted) { dout(10) << __func__ << " deleted, no-op" << dendl; return 0; } // we never truncate internal log files - ceph_assert(h->file->fnode.ino > 1); + ceph_assert(fnode.ino > 1); // truncate off unflushed data? if (h->pos < offset && @@ -3782,20 +3783,58 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ if (r < 0) return r; } - if (offset == h->file->fnode.size) { - return 0; // no-op! - } - if (offset > h->file->fnode.size) { + if (offset > fnode.size) { ceph_abort_msg("truncate up not supported"); } - ceph_assert(h->file->fnode.size >= offset); + ceph_assert(offset <= fnode.size); _flush_bdev(h); - - std::lock_guard ll(log.lock); - vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset); - h->file->fnode.size = offset; - h->file->is_dirty = true; - log.t.op_file_update_inc(h->file->fnode); + { + std::lock_guard ll(log.lock); + std::lock_guard dl(dirty.lock); + bool changed_extents = false; + vselector->sub_usage(h->file->vselector_hint, fnode); + uint64_t x_off = 0; + auto p = fnode.seek(offset, &x_off); + uint64_t cut_off = + (p == fnode.extents.end()) ? 0 : p2roundup(x_off, alloc_size[p->bdev]); + uint64_t new_allocated; + if (0 == cut_off) { + // whole pextent to remove + changed_extents = true; + new_allocated = offset; + } else if (cut_off < p->length) { + dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off); + new_allocated = (offset - x_off) + cut_off; + p->length = cut_off; + changed_extents = true; + ++p; + } else { + ceph_assert(cut_off >= p->length); + new_allocated = (offset - x_off) + p->length; + // just leave it here + ++p; + } + while (p != fnode.extents.end()) { + dirty.pending_release[p->bdev].insert(p->offset, p->length); + p = fnode.extents.erase(p); + changed_extents = true; + } + if (changed_extents) { + fnode.size = offset; + fnode.allocated = new_allocated; + fnode.reset_delta(); + log.t.op_file_update(fnode); + // sad, but is_dirty must be set to signal flushing of the log + h->file->is_dirty = true; + } else { + if (offset != fnode.size) { + fnode.size = offset; + //skipping log.t.op_file_update_inc, it will be done by flush() + h->file->is_dirty = true; + } + } + vselector->add_usage(h->file->vselector_hint, fnode); + } logger->tinc(l_bluefs_truncate_lat, mono_clock::now() - t0); return 0; } diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc index 68040af4282..7cbe0a1d121 100644 --- a/src/os/bluestore/BlueRocksEnv.cc +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -221,18 +221,12 @@ class BlueRocksWritableFile : public rocksdb::WritableFile { } rocksdb::Status Close() override { - fs->fsync(h); - // mimic posix env, here. shrug. 
- size_t block_size; - size_t last_allocated_block; - GetPreallocationStatus(&block_size, &last_allocated_block); - if (last_allocated_block > 0) { - int r = fs->truncate(h, h->pos); - if (r < 0) - return err_to_status(r); + int r = fs->truncate(h, h->pos); + if (r < 0) { + return err_to_status(r); } - + fs->fsync(h); return rocksdb::Status::OK(); } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 44a171873c0..535cf166f0a 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6794,9 +6794,8 @@ void BlueStore::_main_bdev_label_try_reserve() vector<uint64_t> candidate_positions; vector<uint64_t> accepted_positions; uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); - for (size_t i = 1; i < bdev_label_positions.size(); i++) { - uint64_t location = bdev_label_positions[i]; - if (location + lsize <= bdev->get_size()) { + for (uint64_t location : bdev_label_valid_locations) { + if (location != BDEV_FIRST_LABEL_POSITION) { candidate_positions.push_back(location); } } @@ -11497,9 +11496,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) string p = path + "/block"; _write_bdev_label(cct, bdev, p, bdev_label, bdev_labels_in_repair); for (uint64_t pos : bdev_labels_in_repair) { - if (pos != BDEV_FIRST_LABEL_POSITION) { - bdev_label_valid_locations.push_back(pos); - } + bdev_label_valid_locations.push_back(pos); } repaired += bdev_labels_in_repair.size(); } @@ -20572,6 +20569,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool() if (ret < 0) { return ret; } + if (bdev_label_multi) { + uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); + for (uint64_t p : bdev_label_valid_locations) { + if (p != BDEV_FIRST_LABEL_POSITION) { + allocator->init_rm_free(p, lsize); + } + } + } duration = ceph_clock_now() - start; stats.insert_count = 0; diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc index c24bf2161aa..d62721b4366 100644 --- a/src/os/bluestore/bluestore_tool.cc +++ b/src/os/bluestore/bluestore_tool.cc @@ -746,20 +746,25 @@ int main(int argc, char **argv) else if (action == "show-label") { JSONFormatter jf(true); jf.open_object_section("devices"); + bool any_success = false; for (auto& i : devs) { + jf.open_object_section(i.c_str()); bluestore_bdev_label_t label; int r = BlueStore::read_bdev_label(cct.get(), i, &label); if (r < 0) { - cerr << "unable to read label for " << i << ": " - << cpp_strerror(r) << std::endl; - exit(EXIT_FAILURE); + cerr << "unable to read label for " << i << ": " + << cpp_strerror(r) << std::endl; + } else { + any_success = true; + label.dump(&jf); } - jf.open_object_section(i.c_str()); - label.dump(&jf); jf.close_section(); } jf.close_section(); jf.flush(cout); + if (!any_success) { + exit(EXIT_FAILURE); + } } else if (action == "set-label-key") { bluestore_bdev_label_t label; diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc index 02bb04c4a0a..1fc87610502 100644 --- a/src/osd/ECCommon.cc +++ b/src/osd/ECCommon.cc @@ -327,15 +327,14 @@ void ECCommon::ReadPipeline::get_min_want_to_read_shards( { const auto [left_chunk_index, right_chunk_index] = sinfo.offset_length_to_data_chunk_indices(offset, length); - for(uint64_t i = left_chunk_index; i < right_chunk_index; i++) { - auto raw_chunk = i % sinfo.get_data_chunk_count(); + const auto distance = + std::min(right_chunk_index - left_chunk_index, + sinfo.get_data_chunk_count()); + for(uint64_t i = 0; i < distance; i++) { + auto raw_chunk = (left_chunk_index + i) % 
sinfo.get_data_chunk_count(); auto chunk = chunk_mapping.size() > raw_chunk ? chunk_mapping[raw_chunk] : static_cast<int>(raw_chunk); - if (auto [_, inserted] = want_to_read->insert(chunk); !inserted) { - // aready processed all chunks - ceph_assert(want_to_read->size() == sinfo.get_data_chunk_count()); - break; - } + want_to_read->insert(chunk); } } diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index fb3a415a542..ce46bb245ea 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3930,11 +3930,6 @@ int OSD::init() dout(2) << "superblock: I am osd." << superblock.whoami << dendl; - if (cct->_conf.get_val<bool>("osd_compact_on_start")) { - dout(2) << "compacting object store's DB" << dendl; - store->compact(); - } - // prime osd stats { struct store_statfs_t stbuf; @@ -4080,6 +4075,11 @@ int OSD::init() if (is_stopping()) return 0; + if (cct->_conf.get_val<bool>("osd_compact_on_start")) { + dout(2) << "compacting object store's DB" << dendl; + store->compact(); + } + // start objecter *after* we have authenticated, so that we don't ignore // the OSDMaps it requests. service.final_init(); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 76256df49b8..71b9b713385 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1137,46 +1137,10 @@ void PG::update_snap_map( const vector<pg_log_entry_t> &log_entries, ObjectStore::Transaction &t) { - for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) { + for (const auto& entry : log_entries) { OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); - if (i->soid.snap < CEPH_MAXSNAP) { - if (i->is_delete()) { - int r = snap_mapper.remove_oid( - i->soid, - &_t); - if (r) - derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl; - // On removal tolerate missing key corruption - ceph_assert(r == 0 || r == -ENOENT); - } else if (i->is_update()) { - ceph_assert(i->snaps.length() > 0); - vector<snapid_t> snaps; - bufferlist snapbl = i->snaps; - auto p = snapbl.cbegin(); - try { - decode(snaps, p); - } catch (...) 
{ - derr << __func__ << " decode snaps failure on " << *i << dendl; - snaps.clear(); - } - set<snapid_t> _snaps(snaps.begin(), snaps.end()); - - if (i->is_clone() || i->is_promote()) { - snap_mapper.add_oid( - i->soid, - _snaps, - &_t); - } else if (i->is_modify()) { - int r = snap_mapper.update_snaps( - i->soid, - _snaps, - 0, - &_t); - ceph_assert(r == 0); - } else { - ceph_assert(i->is_clean()); - } - } + if (entry.soid.snap < CEPH_MAXSNAP) { + snap_mapper.update_snap_map(entry, &_t); } } } diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 22222b7f7af..8d768ec4a66 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -3033,7 +3033,9 @@ void PeeringState::proc_primary_info( ceph_assert(!is_primary()); update_history(oinfo.history); - if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) { + bool has_scrub_error = (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors); + info.stats = oinfo.stats; + if (has_scrub_error) { info.stats.stats.sum.num_scrub_errors = 0; info.stats.stats.sum.num_shallow_scrub_errors = 0; info.stats.stats.sum.num_deep_scrub_errors = 0; diff --git a/src/osd/osd_types_fmt.h b/src/osd/osd_types_fmt.h index 04f4d46ee51..100ce6e4646 100644 --- a/src/osd/osd_types_fmt.h +++ b/src/osd/osd_types_fmt.h @@ -392,4 +392,6 @@ inline std::ostream &operator<<(std::ostream &lhs, const object_stat_sum_t &sum) #if FMT_VERSION >= 90000 template <bool TrackChanges> struct fmt::formatter<pg_missing_set<TrackChanges>> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<pool_opts_t> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<store_statfs_t> : fmt::ostream_formatter {}; #endif diff --git a/src/osd/scrubber/ScrubStore.cc b/src/osd/scrubber/ScrubStore.cc index a00ab2caece..7f28ca2d642 100644 --- a/src/osd/scrubber/ScrubStore.cc +++ b/src/osd/scrubber/ScrubStore.cc @@ -1,11 +1,13 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab -#include "ScrubStore.h" +#include "./ScrubStore.h" #include "osd/osd_types.h" #include "common/scrub_types.h" #include "include/rados/rados_types.hpp" +#include "pg_scrubber.h" + using std::ostringstream; using std::string; using std::vector; @@ -13,21 +15,9 @@ using std::vector; using ceph::bufferlist; namespace { -ghobject_t make_scrub_object(const spg_t& pgid) -{ - ostringstream ss; - ss << "scrub_" << pgid; - return pgid.make_temp_ghobject(ss.str()); -} - string first_object_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); + auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0x00000000, pool, ""); hoid.build_hash_cache(); return "SCRUB_OBJ_" + hoid.to_str(); } @@ -47,12 +37,7 @@ string to_object_key(int64_t pool, const librados::object_id_t& oid) string last_object_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); + auto hoid = hobject_t(object_t(), "", CEPH_NOSNAP, 0xffffffff, pool, ""); hoid.build_hash_cache(); return "SCRUB_OBJ_" + hoid.to_str(); } @@ -60,14 +45,9 @@ string last_object_key(int64_t pool) string first_snap_key(int64_t pool) { // scrub object is per spg_t object, so we can misuse the hash (pg.seed) for - // the representing the minimal and maximum keys. and this relies on how + // representing the minimal and maximum keys. and this relies on how // hobject_t::to_str() works: hex(pool).hex(revhash). 
- auto hoid = hobject_t(object_t(), - "", - 0, - 0x00000000, - pool, - ""); + auto hoid = hobject_t(object_t(), "", 0, 0x00000000, pool, ""); hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } @@ -86,123 +66,447 @@ string to_snap_key(int64_t pool, const librados::object_id_t& oid) string last_snap_key(int64_t pool) { - auto hoid = hobject_t(object_t(), - "", - 0, - 0xffffffff, - pool, - ""); + auto hoid = hobject_t(object_t(), "", 0, 0xffffffff, pool, ""); hoid.build_hash_cache(); return "SCRUB_SS_" + hoid.to_str(); } + +} // namespace + +#undef dout_context +#define dout_context (m_scrubber.get_pg_cct()) +#define dout_subsys ceph_subsys_osd +#undef dout_prefix +#define dout_prefix _prefix_fn(_dout, this, __func__) + +template <class T> +static std::ostream& _prefix_fn(std::ostream* _dout, T* t, std::string fn = "") +{ + return t->gen_prefix(*_dout, fn); } namespace Scrub { -Store* -Store::create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll) +Store::Store( + PgScrubber& scrubber, + ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll) + : m_scrubber{scrubber} + , object_store{osd_store} + , coll{coll} { - ceph_assert(store); ceph_assert(t); - ghobject_t oid = make_scrub_object(pgid); - t->touch(coll, oid); - return new Store{coll, oid, store}; + + // shallow errors DB object + const auto sh_err_obj = + pgid.make_temp_ghobject(fmt::format("scrub_{}", pgid)); + t->touch(coll, sh_err_obj); + shallow_db.emplace( + pgid, sh_err_obj, OSDriver{&object_store, coll, sh_err_obj}); + + // and the DB for deep errors + const auto dp_err_obj = + pgid.make_temp_ghobject(fmt::format("deep_scrub_{}", pgid)); + t->touch(coll, dp_err_obj); + deep_db.emplace(pgid, dp_err_obj, OSDriver{&object_store, coll, dp_err_obj}); + + dout(20) << fmt::format( + "created Scrub::Store for pg[{}], shallow: {}, deep: {}", + pgid, sh_err_obj, dp_err_obj) + << dendl; } -Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store) - : coll(coll), - hoid(oid), - driver(store, coll, hoid), - backend(&driver) -{} Store::~Store() { - ceph_assert(results.empty()); + ceph_assert(!shallow_db || shallow_db->results.empty()); + ceph_assert(!deep_db || deep_db->results.empty()); } + +std::ostream& Store::gen_prefix(std::ostream& out, std::string_view fn) const +{ + if (fn.starts_with("operator")) { + // it's a lambda, and __func__ is not available + return m_scrubber.gen_prefix(out) << "Store::"; + } else { + return m_scrubber.gen_prefix(out) << "Store::" << fn << ": "; + } +} + + void Store::add_error(int64_t pool, const inconsistent_obj_wrapper& e) { add_object_error(pool, e); } + void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e) { - bufferlist bl; - e.encode(bl); - results[to_object_key(pool, e.object)] = bl; + using librados::obj_err_t; + const auto key = to_object_key(pool, e.object); + dout(20) << fmt::format( + "{}: adding error for object {} ({}). Errors: {} ({}/{}) " + "unfiltered:{}", + (current_level == scrub_level_t::deep ? 
"deep" : "shallow"), + e.object, key, obj_err_t{e.errors}, + obj_err_t{e.errors & obj_err_t::SHALLOW_ERRORS}, + obj_err_t{e.errors & obj_err_t::DEEP_ERRORS}, e) + << dendl; + + if (current_level == scrub_level_t::deep) { + // not overriding the deep errors DB during shallow scrubs + deep_db->results[key] = e.encode(); + } + + // only shallow errors are stored in the shallow DB + auto e_copy = e; + e_copy.errors &= librados::obj_err_t::SHALLOW_ERRORS; + e_copy.union_shards.errors &= librados::err_t::SHALLOW_ERRORS; + shallow_db->results[key] = e_copy.encode(); } + void Store::add_error(int64_t pool, const inconsistent_snapset_wrapper& e) { add_snap_error(pool, e); } + void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e) { - bufferlist bl; - e.encode(bl); - results[to_snap_key(pool, e.object)] = bl; + // note: snap errors are only placed in the shallow store + shallow_db->results[to_snap_key(pool, e.object)] = e.encode(); } -bool Store::empty() const + +bool Store::is_empty() const { - return results.empty(); + return (!shallow_db || shallow_db->results.empty()) && + (!deep_db || deep_db->results.empty()); } + void Store::flush(ObjectStore::Transaction* t) { if (t) { - OSDriver::OSTransaction txn = driver.get_transaction(t); - backend.set_keys(results, &txn); + auto txn = shallow_db->driver.get_transaction(t); + shallow_db->backend.set_keys(shallow_db->results, &txn); + txn = deep_db->driver.get_transaction(t); + deep_db->backend.set_keys(deep_db->results, &txn); + } + + shallow_db->results.clear(); + deep_db->results.clear(); +} + + +void Store::clear_level_db( + ObjectStore::Transaction* t, + at_level_t& db, + std::string_view db_name) +{ + dout(20) << fmt::format("removing (omap) entries for {} error DB", db_name) + << dendl; + // easiest way to guarantee that the object representing the DB exists + t->touch(coll, db.errors_hoid); + + // remove all the keys in the DB + t->omap_clear(coll, db.errors_hoid); + + // restart the 'in progress' part of the MapCacher + db.backend.reset(); +} + + +void Store::reinit( + ObjectStore::Transaction* t, + scrub_level_t level) +{ + // Note: only one caller, and it creates the transaction passed to reinit(). + // No need to assert on 't' + dout(20) << fmt::format( + "re-initializing the Scrub::Store (for {} scrub)", + (level == scrub_level_t::deep ? "deep" : "shallow")) + << dendl; + + current_level = level; + + // always clear the known shallow errors DB (as both shallow and deep scrubs + // would recreate it) + if (shallow_db) { + clear_level_db(t, *shallow_db, "shallow"); + } + // only a deep scrub recreates the deep errors DB + if (level == scrub_level_t::deep && deep_db) { + clear_level_db(t, *deep_db, "deep"); } - results.clear(); } + void Store::cleanup(ObjectStore::Transaction* t) { - t->remove(coll, hoid); + dout(20) << "discarding error DBs" << dendl; + ceph_assert(t); + if (shallow_db) + t->remove(coll, shallow_db->errors_hoid); + if (deep_db) + t->remove(coll, deep_db->errors_hoid); } -std::vector<bufferlist> -Store::get_snap_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const + +std::vector<bufferlist> Store::get_snap_errors( + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const { - const string begin = (start.name.empty() ? - first_snap_key(pool) : to_snap_key(pool, start)); + vector<bufferlist> errors; + const string begin = + (start.name.empty() ? 
first_snap_key(pool) : to_snap_key(pool, start)); const string end = last_snap_key(pool); - return get_errors(begin, end, max_return); + + // the snap errors are stored only in the shallow store + ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(begin); + + while (max_return-- && latest_sh.has_value() && latest_sh->last_key < end) { + errors.push_back(latest_sh->data); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + } + + return errors; } -std::vector<bufferlist> -Store::get_object_errors(int64_t pool, - const librados::object_id_t& start, - uint64_t max_return) const + +std::vector<bufferlist> Store::get_object_errors( + int64_t pool, + const librados::object_id_t& start, + uint64_t max_return) const { - const string begin = (start.name.empty() ? - first_object_key(pool) : to_object_key(pool, start)); + const string begin = + (start.name.empty() ? first_object_key(pool) + : to_object_key(pool, start)); const string end = last_object_key(pool); + dout(20) << fmt::format("fetching errors, from {} to {}", begin, end) + << dendl; return get_errors(begin, end, max_return); } -std::vector<bufferlist> -Store::get_errors(const string& begin, - const string& end, - uint64_t max_return) const + +inline void decode( + librados::inconsistent_obj_t& obj, + ceph::buffer::list::const_iterator& bp) { + reinterpret_cast<inconsistent_obj_wrapper&>(obj).decode(bp); +} + + +inconsistent_obj_wrapper decode_wrapper( + hobject_t obj, + ceph::buffer::list::const_iterator bp) +{ + inconsistent_obj_wrapper iow{obj}; + iow.decode(bp); + return iow; +} + + +void Store::collect_specific_store( + MapCacher::MapCacher<std::string, ceph::buffer::list>& backend, + Store::ExpCacherPosData& latest, + std::vector<bufferlist>& errors, + std::string_view end_key, + uint64_t max_return) const +{ + while (max_return-- && latest.has_value() && + latest.value().last_key < end_key) { + errors.push_back(latest->data); + latest = backend.get_1st_after_key(latest->last_key); + } +} + + +/* + * Implementation notes: + * - see https://github.com/ceph/ceph/commit/df3ff6dafeadb3822b35c424a890db9a14d7f60f + * for why we encode the shard_info_t in the store. + * - to maintain known shard_info-s created during a deep scrub (but only when + * needed), we use our knowledge of the level of the last scrub performed + * (current_level), and the object user version as encoded in the error + * structure. + */ +bufferlist Store::merge_encoded_error_wrappers( + hobject_t obj, + ExpCacherPosData& latest_sh, + ExpCacherPosData& latest_dp) const +{ + // decode both error wrappers + auto sh_wrap = decode_wrapper(obj, latest_sh->data.cbegin()); + auto dp_wrap = decode_wrapper(obj, latest_dp->data.cbegin()); + + // note: the '20' level is just until we're sure the merging works as + // expected + if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) { + dout(20) << fmt::format( + "merging errors {}. Deep: {:#x}-({})", sh_wrap.object, + dp_wrap.errors, dp_wrap) + << dendl; + dout(20) << fmt::format( + "merging errors {}. 
Shallow: {:#x}-({})", sh_wrap.object, + sh_wrap.errors, sh_wrap) + << dendl; + // dev: list the attributes: + for (const auto& [shard, si] : sh_wrap.shards) { + for (const auto& [attr, bl] : si.attrs) { + dout(20) << fmt::format(" shallow: shard {} attr: {}", shard, attr) + << dendl; + } + } + for (const auto& [shard, si] : dp_wrap.shards) { + for (const auto& [attr, bl] : si.attrs) { + dout(20) << fmt::format(" deep: shard {} attr: {}", shard, attr) + << dendl; + } + } + } + + // Actual merging of the shard map entries is only performed if the + // latest version is from the shallow scrub. + // Otherwise, the deep scrub, which (for the shards info) contains all data, + // and the shallow scrub is ignored. + if (current_level == scrub_level_t::shallow) { + // is the object data related to the same object version? + if (sh_wrap.version == dp_wrap.version) { + // combine the error information + dp_wrap.errors |= sh_wrap.errors; + for (const auto& [shard, si] : sh_wrap.shards) { + if (dp_wrap.shards.contains(shard)) { + dout(20) << fmt::format( + "-----> {}-{} combining: sh-errors: {} dp-errors:{}", + sh_wrap.object, shard, si, dp_wrap.shards[shard]) + << dendl; + const auto saved_er = dp_wrap.shards[shard].errors; + dp_wrap.shards[shard].selected_oi = si.selected_oi; + dp_wrap.shards[shard].primary = si.primary; + dp_wrap.shards[shard].errors |= saved_er; + + // the attributes: + for (const auto& [attr, bl] : si.attrs) { + if (!dp_wrap.shards[shard].attrs.contains(attr)) { + dout(20) << fmt::format( + "-----> {}-{} copying shallow attr: attr: {}", + sh_wrap.object, shard, attr) + << dendl; + dp_wrap.shards[shard].attrs[attr] = bl; + } + // otherwise - we'll ignore the shallow attr buffer + } + } else { + // the deep scrub data for this shard is missing. We take the shallow + // scrub data. + dp_wrap.shards[shard] = si; + } + } + } else if (sh_wrap.version > dp_wrap.version) { + if (false && dp_wrap.version == 0) { + // there was a read error in the deep scrub. The deep version + // shows as '0'. That's severe enough for us to ignore the shallow. + dout(10) << fmt::format("{} ignoring deep after read failure", + sh_wrap.object) + << dendl; + } else { + // There is a new shallow version of the object results. + // The deep data is for an older version of that object. + // There are multiple possibilities here, but for now we ignore the + // deep data. + dp_wrap = sh_wrap; + } + } + } + + return dp_wrap.encode(); +} + + +// a better way to implement get_errors(): use two generators, one for each store. +// and sort-merge the results. Almost like a merge-sort, but with equal +// keys combined. 'todo' once 'ranges' are really working. + +std::vector<bufferlist> Store::get_errors( + const std::string& from_key, + const std::string& end_key, + uint64_t max_return) const +{ + // merge the input from the two sorted DBs into 'errors' (until + // enough errors are collected) vector<bufferlist> errors; - auto next = std::make_pair(begin, bufferlist{}); - while (max_return && !backend.get_next(next.first, &next)) { - if (next.first >= end) + dout(20) << fmt::format("getting errors from {} to {}", from_key, end_key) + << dendl; + + ceph_assert(shallow_db); + ceph_assert(deep_db); + ExpCacherPosData latest_sh = shallow_db->backend.get_1st_after_key(from_key); + ExpCacherPosData latest_dp = deep_db->backend.get_1st_after_key(from_key); + + while (max_return) { + dout(20) << fmt::format( + "n:{} latest_sh: {}, latest_dp: {}", max_return, + (latest_sh ? latest_sh->last_key : "(none)"), + (latest_dp ? 
latest_dp->last_key : "(none)")) + << dendl; + + // keys not smaller than end_key are not interesting + if (latest_sh.has_value() && latest_sh->last_key >= end_key) { + latest_sh = tl::unexpected(-EINVAL); + } + if (latest_dp.has_value() && latest_dp->last_key >= end_key) { + latest_dp = tl::unexpected(-EINVAL); + } + + if (!latest_sh && !latest_dp) { + // both stores are exhausted + break; + } + if (!latest_sh.has_value()) { + // continue with the deep store + dout(10) << fmt::format("collecting from deep store") << dendl; + collect_specific_store( + deep_db->backend, latest_dp, errors, end_key, max_return); break; - errors.push_back(next.second); + } + if (!latest_dp.has_value()) { + // continue with the shallow store + dout(10) << fmt::format("collecting from shallow store") << dendl; + collect_specific_store( + shallow_db->backend, latest_sh, errors, end_key, max_return); + break; + } + + // we have results from both stores. Select the one with a lower key. + // If the keys are equal, combine the errors. + if (latest_sh->last_key == latest_dp->last_key) { + auto bl = merge_encoded_error_wrappers( + shallow_db->errors_hoid.hobj, latest_sh, latest_dp); + errors.push_back(bl); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + + } else if (latest_sh->last_key < latest_dp->last_key) { + dout(20) << fmt::format("shallow store element ({})", latest_sh->last_key) + << dendl; + errors.push_back(latest_sh->data); + latest_sh = shallow_db->backend.get_1st_after_key(latest_sh->last_key); + } else { + dout(20) << fmt::format("deep store element ({})", latest_dp->last_key) + << dendl; + errors.push_back(latest_dp->data); + latest_dp = deep_db->backend.get_1st_after_key(latest_dp->last_key); + } max_return--; } + + dout(10) << fmt::format("{} errors reported", errors.size()) << dendl; return errors; } - -} // namespace Scrub +} // namespace Scrub diff --git a/src/osd/scrubber/ScrubStore.h b/src/osd/scrubber/ScrubStore.h index 567badf608b..0955654d78e 100644 --- a/src/osd/scrubber/ScrubStore.h +++ b/src/osd/scrubber/ScrubStore.h @@ -1,10 +1,9 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab - -#ifndef CEPH_SCRUB_RESULT_H -#define CEPH_SCRUB_RESULT_H +#pragma once #include "common/map_cacher.hpp" +#include "osd/osd_types_fmt.h" #include "osd/SnapMapper.h" // for OSDriver namespace librados { @@ -13,27 +12,71 @@ struct object_id_t; struct inconsistent_obj_wrapper; struct inconsistent_snapset_wrapper; +class PgScrubber; namespace Scrub { +/** + * Storing errors detected during scrubbing. + * + * From both functional and internal perspectives, the store is a pair of key-value + * databases: one maps objects to shallow errors detected during their scrubbing, + * and other stores deep errors. + * Note that the first store is updated in both shallow and in deep scrubs. The + * second - only while deep scrubbing. + * + * The DBs can be consulted by the operator, when trying to list 'errors known + * at this point in time'. Whenever a scrub starts - the relevant entries in the + * DBs are removed. Specifically - the shallow errors DB is recreated each scrub, + * while the deep errors DB is recreated only when a deep scrub starts. + * + * When queried - the data from both DBs is merged for each named object, and + * returned to the operator. + * + * Implementation: + * Each of the two DBs is implemented as OMAP entries of a single, uniquely named, + * object. 
Both DBs are cached using the general KV Cache mechanism. + */ + class Store { public: ~Store(); - static Store* create(ObjectStore* store, - ObjectStore::Transaction* t, - const spg_t& pgid, - const coll_t& coll); + + Store( + PgScrubber& scrubber, + ObjectStore& osd_store, + ObjectStore::Transaction* t, + const spg_t& pgid, + const coll_t& coll); + + + /// mark down detected errors, either shallow or deep void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e); + void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e); // and a variant-friendly interface: void add_error(int64_t pool, const inconsistent_obj_wrapper& e); void add_error(int64_t pool, const inconsistent_snapset_wrapper& e); - bool empty() const; + [[nodiscard]] bool is_empty() const; void flush(ObjectStore::Transaction*); + + /// remove both shallow and deep errors DBs. Called on interval. void cleanup(ObjectStore::Transaction*); + /** + * prepare the Store object for a new scrub session. + * This involves clearing one or both of the errors DBs, and resetting + * the cache. + * + * @param level: the scrub level to prepare for. Whenever a deep scrub + * is requested, both the shallow and deep errors DBs are cleared. + * If, on the other hand, a shallow scrub is requested, only the shallow + * errors DB is cleared. + */ + void reinit(ObjectStore::Transaction* t, scrub_level_t level); + std::vector<ceph::buffer::list> get_snap_errors( int64_t pool, const librados::object_id_t& start, @@ -44,20 +87,89 @@ class Store { const librados::object_id_t& start, uint64_t max_return) const; + std::ostream& gen_prefix(std::ostream& out, std::string_view fn) const; + private: - Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store); - std::vector<ceph::buffer::list> get_errors(const std::string& start, - const std::string& end, - uint64_t max_return) const; - private: + /** + * at_level_t + * + * The machinery for caching and storing errors at a specific scrub level. + */ + struct at_level_t { + at_level_t(const spg_t& pgid, const ghobject_t& err_obj, OSDriver&& drvr) + : errors_hoid{err_obj} + , driver{std::move(drvr)} + , backend{&driver} + {} + + /// the object in the PG store, where the errors are stored + ghobject_t errors_hoid; + + /// abstracted key fetching + OSDriver driver; + + /// a K,V cache for the errors that are detected during the scrub + /// session. The errors marked for a specific object are stored as + /// an OMap entry with the object's name as the key. + MapCacher::MapCacher<std::string, ceph::buffer::list> backend; + + /// a temp object mapping seq-id to inconsistencies + std::map<std::string, ceph::buffer::list> results; + }; + + using CacherPosData = + MapCacher::MapCacher<std::string, ceph::buffer::list>::PosAndData; + using ExpCacherPosData = tl::expected<CacherPosData, int>; + + /// access to the owning Scrubber object, for logging mostly + PgScrubber& m_scrubber; + + /// the OSD's storage backend + ObjectStore& object_store; + + /// the collection (i.e. - the PG store) in which the errors are stored const coll_t coll; - const ghobject_t hoid; - // a temp object holding mappings from seq-id to inconsistencies found in - // scrubbing - OSDriver driver; - mutable MapCacher::MapCacher<std::string, ceph::buffer::list> backend; - std::map<std::string, ceph::buffer::list> results; + + scrub_level_t current_level; + + /** + * the machinery (backend details, cache, etc.) 
for storing both levels + * of errors (note: 'optional' to allow delayed creation w/o dynamic + * allocations; and 'mutable', as the caching mechanism is used in const + * methods) + */ + mutable std::optional<at_level_t> shallow_db; + mutable std::optional<at_level_t> deep_db; + + std::vector<ceph::buffer::list> get_errors( + const std::string& start, + const std::string& end, + uint64_t max_return) const; + + void collect_specific_store( + MapCacher::MapCacher<std::string, ceph::buffer::list>& backend, + ExpCacherPosData& latest, + std::vector<bufferlist>& errors, + std::string_view end_key, + uint64_t max_return) const; + + /** + * Clear the DB of errors at a specific scrub level by performing an + * omap_clear() on the DB object, and resetting the MapCacher. + */ + void clear_level_db( + ObjectStore::Transaction* t, + at_level_t& db, + std::string_view db_name); + + /** + * merge the two error wrappers - fetched from both DBs for the same object. + * Specifically, the object errors are or'ed, and so are the per-shard + * entries. + */ + bufferlist merge_encoded_error_wrappers( + hobject_t obj, + ExpCacherPosData& latest_sh, + ExpCacherPosData& latest_dp) const; }; } // namespace Scrub - -#endif // CEPH_SCRUB_RESULT_H diff --git a/src/osd/scrubber/osd_scrub.cc b/src/osd/scrubber/osd_scrub.cc index c67d2fca5fc..c8cf27d2116 100644 --- a/src/osd/scrubber/osd_scrub.cc +++ b/src/osd/scrubber/osd_scrub.cc @@ -220,8 +220,6 @@ Scrub::OSDRestrictions OsdScrub::restrictions_on_scrubbing( env_conditions.restricted_time = !scrub_time_permit(scrub_clock_now); env_conditions.cpu_overloaded = !m_load_tracker.scrub_load_below_threshold(); - env_conditions.only_deadlined = - env_conditions.restricted_time || env_conditions.cpu_overloaded; } return env_conditions; diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc index 8ff0d1ff7d8..cd80625aaec 100644 --- a/src/osd/scrubber/osd_scrub_sched.cc +++ b/src/osd/scrubber/osd_scrub_sched.cc @@ -86,8 +86,6 @@ std::optional<Scrub::SchedEntry> ScrubQueue::pop_ready_entry( OSDRestrictions restrictions, utime_t time_now) { - /// \todo must handle 'only_deadlined'! - auto eligible_filtr = [&, rst = restrictions]( const SchedEntry& e) -> bool { return eligibility_pred(e, rst, time_now); diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 555d13ba72b..c37f31d28dc 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1183,6 +1183,7 @@ void PgScrubber::_request_scrub_map(pg_shard_t replica, m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch()); } +// only called on interval change. Both DBs are to be removed. void PgScrubber::cleanup_store(ObjectStore::Transaction* t) { if (!m_store) @@ -1200,6 +1201,38 @@ void PgScrubber::cleanup_store(ObjectStore::Transaction* t) ceph_assert(!m_store); } + +void PgScrubber::reinit_scrub_store() +{ + // Entering, 0 to 3 of the following objects(*) may exist: + // ((*)'objects' here: both code objects (the ScrubStore object) and + // actual Object Store objects). + // 1. The ScrubStore object itself. + // 2,3. The two special hobjects in the coll (the PG data) holding the last + // scrub's results. + // + // The Store object can be deleted and recreated, as a way to guarantee + // no junk is left. We won't do it here, but we will clear the at_level_t + // structures. + // The hobjects: possibly. The shallow DB object is always cleared. The + // deep one - only if running a deep scrub. 
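The clearing policy spelled out in the comment above (the shallow DB wiped on every scrub, the deep DB only when a deep scrub starts) can be summarized in a few lines of Python. ErrorStoreSketch and ScrubLevel below are illustrative stand-ins, not the real Scrub::Store interface:

```
from enum import Enum

class ScrubLevel(Enum):
    shallow = 0
    deep = 1

class ErrorStoreSketch:
    """Toy model: two dicts stand in for the shallow/deep errors OMAP objects."""
    def __init__(self):
        self.shallow_db = {}
        self.deep_db = {}

    def reinit(self, level: ScrubLevel) -> None:
        self.shallow_db.clear()          # cleared at the start of every scrub
        if level == ScrubLevel.deep:
            self.deep_db.clear()         # cleared only when a deep scrub starts

store = ErrorStoreSketch()
store.shallow_db['obj_1'] = 'stale shallow error'
store.deep_db['obj_1'] = 'error found by the last deep scrub'
store.reinit(ScrubLevel.shallow)
assert not store.shallow_db
assert store.deep_db                     # deep results survive a shallow scrub
```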
+ ObjectStore::Transaction t; + if (m_store) { + dout(10) << __func__ << " reusing existing store" << dendl; + m_store->flush(&t); + } else { + dout(10) << __func__ << " creating new store" << dendl; + m_store = std::make_unique<Scrub::Store>( + *this, *m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll); + } + + // regardless of whether the ScrubStore object was recreated or reused, we need to + // (possibly) clear the actual DB objects in the Object Store. + m_store->reinit(&t, m_active_target->level()); + m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); +} + + void PgScrubber::on_init() { // going upwards from 'inactive' @@ -1217,14 +1250,8 @@ void PgScrubber::on_init() m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow, m_pg->get_actingset()); - // create a new store - { - ObjectStore::Transaction t; - cleanup_store(&t); - m_store.reset( - Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll)); - m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); - } + // create or reuse the 'known errors' store + reinit_scrub_store(); m_start = m_pg->info.pgid.pgid.get_hobj_start(); m_active = true; @@ -2660,9 +2687,53 @@ void PgScrubber::log_cluster_warning(const std::string& warning) const m_osds->clog->do_log(CLOG_WARN, warning); } -ostream& PgScrubber::show(ostream& out) const + +ostream& PgScrubber::show_concise(ostream& out) const { - return out << " [ " << m_pg_id << ": " << m_flags << " ] "; + /* + * 'show_concise()' is only used when calling operator<< thru the ScrubPgIF, + * i.e. only by the PG when creating a standard log entry. + * + * desired outcome (only relevant for Primaries): + * + * if scrubbing: + * (urgency,flags) + * or (if blocked) + * (*blocked*,urgency,flags) + * + * if not scrubbing: + * either nothing (if only periodic scrubs are scheduled) + * or [next-scrub: effective-lvl, urgency] + */ + if (!is_primary()) { + return out; + } + + if (m_active) { + const auto flags_txt = fmt::format("{}", m_flags); + const std::string sep = (flags_txt.empty() ? "" : ","); + if (m_active_target) { + return out << fmt::format( + "({}{}{}{})", (m_scrub_job->blocked ? "*blocked*," : ""), + m_active_target->urgency(), sep, flags_txt); + } else { + // only expected in a couple of messages during scrub termination + return out << fmt::format( + "(teardown{}{}{})", (m_scrub_job->blocked ? "-*blocked*" : ""), + sep, flags_txt); + } + } + + // not actively scrubbing now. Show some info about the next scrub + const auto now_is = ceph_clock_now(); + const auto& next_scrub = m_scrub_job->earliest_target(now_is); + if (!next_scrub.is_high_priority()) { + // no interesting flags to report + return out; + } + return out << fmt::format( + "[next-scrub:{},{:10.10}]", (next_scrub.is_deep() ? 
"dp" : "sh"), + next_scrub.urgency()); } int PgScrubber::asok_debug(std::string_view cmd, diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index ff8c98d387e..3d7e16cd359 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -164,7 +164,7 @@ template <> struct formatter<scrub_flags_t> { constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } template <typename FormatContext> - auto format(scrub_flags_t& sf, FormatContext& ctx) const + auto format(const scrub_flags_t& sf, FormatContext& ctx) const { std::string txt; bool sep{false}; @@ -528,7 +528,7 @@ class PgScrubber : public ScrubPgIF, /// to complete (in order to perform an 'after-repair' scrub) bool m_after_repair_scrub_required{false}; - ostream& show(ostream& out) const override; + ostream& show_concise(ostream& out) const override; public: // ------------------ the I/F used by the ScrubBackend (ScrubBeListener) @@ -741,6 +741,12 @@ class PgScrubber : public ScrubPgIF, bool m_publish_sessions{false}; //< will the counter be part of 'query' //output? + /** + * the scrub operation flags. + * Set at scrub start. Checked in multiple locations - mostly + * at finish. + * Note: replicas only use the 'priority' field. + */ scrub_flags_t m_flags; bool m_active{false}; @@ -771,6 +777,16 @@ class PgScrubber : public ScrubPgIF, std::unique_ptr<Scrub::Store> m_store; + /** + * the ScrubStore sub-object caches and manages the database of known + * scrub errors. reinit_scrub_store() clears the database and re-initializes + * the ScrubStore object. + * + * in the next iteration - reinit_..() potentially deletes only the + * shallow errors part of the database. + */ + void reinit_scrub_store(); + int num_digest_updates_pending{0}; hobject_t m_start, m_end; ///< note: half-closed: [start,end) diff --git a/src/osd/scrubber/scrub_queue_entry.h b/src/osd/scrubber/scrub_queue_entry.h index 03d959769b2..aeb76c104fe 100644 --- a/src/osd/scrubber/scrub_queue_entry.h +++ b/src/osd/scrubber/scrub_queue_entry.h @@ -98,11 +98,6 @@ static inline std::weak_ordering cmp_ripe_entries( if (auto cmp = r.urgency <=> l.urgency; cmp != 0) { return cmp; } - // if we are comparing the two targets of the same PG, once both are - // ripe - the 'deep' scrub is considered 'higher' than the 'shallow' one. - if (l.pgid == r.pgid && r.level < l.level) { - return std::weak_ordering::less; - } // the 'utime_t' operator<=> is 'partial_ordering', it seems. if (auto cmp = std::weak_order( double(l.schedule.scheduled_at), double(r.schedule.scheduled_at)); diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index d1a0fbdccb5..809107e593b 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -92,10 +92,10 @@ struct OSDRestrictions { /// the OSD is performing recovery & osd_repair_during_recovery is 'true' bool allow_requested_repair_only:1{false}; - /// the load is high, or the time is not right. For periodic scrubs, - /// only the overdue ones are allowed. - bool only_deadlined:1{false}; + /// the CPU load is high. No regular scrubs are allowed. 
bool cpu_overloaded:1{false}; + + /// outside of allowed scrubbing hours/days bool restricted_time:1{false}; /// the OSD is performing a recovery, osd_scrub_during_recovery is 'false', @@ -299,12 +299,11 @@ struct ScrubPgIF { virtual ~ScrubPgIF() = default; - friend std::ostream& operator<<(std::ostream& out, const ScrubPgIF& s) - { - return s.show(out); + friend std::ostream& operator<<(std::ostream& out, const ScrubPgIF& s) { + return s.show_concise(out); } - virtual std::ostream& show(std::ostream& out) const = 0; + virtual std::ostream& show_concise(std::ostream& out) const = 0; // --------------- triggering state-machine events: diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h index 5e1677de7c0..d15862c08ba 100644 --- a/src/osdc/Journaler.h +++ b/src/osdc/Journaler.h @@ -529,43 +529,35 @@ public: // =================== Header get_last_committed() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return last_committed; } Header get_last_written() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return last_written; } uint64_t get_layout_period() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return layout.get_period(); } file_layout_t get_layout() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return layout; } bool is_active() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return state == STATE_ACTIVE; } bool is_stopping() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return state == STATE_STOPPING; } int get_error() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return error; } bool is_readonly() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return readonly; } @@ -573,32 +565,26 @@ public: bool _is_readable(); bool try_read_entry(bufferlist& bl); uint64_t get_write_pos() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return write_pos; } uint64_t get_write_safe_pos() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return safe_pos; } uint64_t get_read_pos() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return read_pos; } uint64_t get_expire_pos() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return expire_pos; } uint64_t get_trimmed_pos() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return trimmed_pos; } size_t get_journal_envelope_size() const { - ceph_assert(!ceph_mutex_is_locked_by_me(lock)); lock_guard l(lock); return journal_stream.get_envelope_size(); } diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 68bd76268ae..927c7e41329 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -48,7 +48,6 @@ #include "include/function2.hpp" #include "include/neorados/RADOS_Decodable.hpp" -#include "common/async/completion.h" #include "common/admin_socket.h" #include "common/ceph_time.h" #include "common/ceph_mutex.h" @@ -1968,30 +1967,6 @@ public: } } - boost::asio::any_completion_handler<void(boost::system::error_code)> - OpCompletionVert(std::unique_ptr<ceph::async::Completion< - void(boost::system::error_code)>> c) { - if (c) - return [c = std::move(c)](boost::system::error_code ec) mutable { - c->dispatch(std::move(c), ec); - }; - else - return nullptr; - } - - template<typename T> - boost::asio::any_completion_handler<void(boost::system::error_code, T)> - 
OpCompletionVert(std::unique_ptr<ceph::async::Completion< - void(boost::system::error_code, T)>> c) { - if (c) { - return [c = std::move(c)](boost::system::error_code ec, T t) mutable { - c->dispatch(std::move(c), ec, std::move(t)); - }; - } else { - return nullptr; - } - } - struct Op : public RefCountedObject { OSDSession *session = nullptr; int incarnation = 0; @@ -3268,18 +3243,6 @@ public: return linger_watch(info, op, snapc, mtime, inbl, OpContextVert<ceph::buffer::list>(onfinish, nullptr), objver); } - ceph_tid_t linger_watch(LingerOp *info, - ObjectOperation& op, - const SnapContext& snapc, ceph::real_time mtime, - ceph::buffer::list& inbl, - std::unique_ptr<ceph::async::Completion< - void(boost::system::error_code, - ceph::buffer::list)>> onfinish, - version_t *objver) { - return linger_watch(info, op, snapc, mtime, inbl, - OpCompletionVert<ceph::buffer::list>( - std::move(onfinish)), objver); - } ceph_tid_t linger_notify(LingerOp *info, ObjectOperation& op, snapid_t snap, ceph::buffer::list& inbl, @@ -3295,17 +3258,6 @@ public: OpContextVert(onack, poutbl), objver); } - ceph_tid_t linger_notify(LingerOp *info, - ObjectOperation& op, - snapid_t snap, ceph::buffer::list& inbl, - std::unique_ptr<ceph::async::Completion< - void(boost::system::error_code, - ceph::buffer::list)>> onack, - version_t *objver) { - return linger_notify(info, op, snap, inbl, - OpCompletionVert<ceph::buffer::list>( - std::move(onack)), objver); - } tl::expected<ceph::timespan, boost::system::error_code> linger_check(LingerOp *info); void linger_cancel(LingerOp *info); // releases a reference @@ -3886,12 +3838,6 @@ public: create_pool_snap(pool, snapName, OpContextVert<ceph::buffer::list>(c, nullptr)); } - void create_pool_snap( - int64_t pool, std::string_view snapName, - std::unique_ptr<ceph::async::Completion<PoolOp::OpSig>> c) { - create_pool_snap(pool, snapName, - OpCompletionVert<ceph::buffer::list>(std::move(c))); - } void allocate_selfmanaged_snap(int64_t pool, boost::asio::any_completion_handler< void(boost::system::error_code, @@ -3901,12 +3847,6 @@ public: allocate_selfmanaged_snap(pool, OpContextVert(c, psnapid)); } - void allocate_selfmanaged_snap(int64_t pool, - std::unique_ptr<ceph::async::Completion<void( - boost::system::error_code, snapid_t)>> c) { - allocate_selfmanaged_snap(pool, - OpCompletionVert<snapid_t>(std::move(c))); - } void delete_pool_snap(int64_t pool, std::string_view snapName, decltype(PoolOp::onfinish)&& onfinish); void delete_pool_snap(int64_t pool, std::string_view snapName, @@ -3914,12 +3854,6 @@ public: delete_pool_snap(pool, snapName, OpContextVert<ceph::buffer::list>(c, nullptr)); } - void delete_pool_snap(int64_t pool, std::string_view snapName, - std::unique_ptr<ceph::async::Completion<void( - boost::system::error_code, ceph::buffer::list)>> c) { - delete_pool_snap(pool, snapName, - OpCompletionVert<ceph::buffer::list>(std::move(c))); - } void delete_selfmanaged_snap(int64_t pool, snapid_t snap, decltype(PoolOp::onfinish)&& onfinish); @@ -3928,12 +3862,6 @@ public: delete_selfmanaged_snap(pool, snap, OpContextVert<ceph::buffer::list>(c, nullptr)); } - void delete_selfmanaged_snap(int64_t pool, snapid_t snap, - std::unique_ptr<ceph::async::Completion<void( - boost::system::error_code, ceph::buffer::list)>> c) { - delete_selfmanaged_snap(pool, snap, - OpCompletionVert<ceph::buffer::list>(std::move(c))); - } void create_pool(std::string_view name, @@ -3945,25 +3873,12 @@ public: OpContextVert<ceph::buffer::list>(onfinish, nullptr), crush_rule); } - void 
create_pool(std::string_view name, - std::unique_ptr<ceph::async::Completion<void( - boost::system::error_code, ceph::buffer::list)>> c, - int crush_rule=-1) { - create_pool(name, - OpCompletionVert<ceph::buffer::list>(std::move(c)), - crush_rule); - } void delete_pool(int64_t pool, decltype(PoolOp::onfinish)&& onfinish); void delete_pool(int64_t pool, Context* onfinish) { delete_pool(pool, OpContextVert<ceph::buffer::list>(onfinish, nullptr)); } - void delete_pool(int64_t pool, - std::unique_ptr<ceph::async::Completion<void( - boost::system::error_code, ceph::buffer::list)>> c) { - delete_pool(pool, OpCompletionVert<ceph::buffer::list>(std::move(c))); - } void delete_pool(std::string_view name, decltype(PoolOp::onfinish)&& onfinish); @@ -3972,11 +3887,6 @@ public: Context* onfinish) { delete_pool(name, OpContextVert<ceph::buffer::list>(onfinish, nullptr)); } - void delete_pool(std::string_view name, - std::unique_ptr<ceph::async::Completion<void( - boost::system::error_code, ceph::buffer::list)>> c) { - delete_pool(name, OpCompletionVert<ceph::buffer::list>(std::move(c))); - } void handle_pool_op_reply(MPoolOpReply *m); int pool_op_cancel(ceph_tid_t tid, int r); @@ -4026,11 +3936,6 @@ public: Context *onfinish) { get_fs_stats_(poolid, OpContextVert(onfinish, result)); } - void get_fs_stats(std::optional<int64_t> poolid, - std::unique_ptr<ceph::async::Completion<void( - boost::system::error_code, struct ceph_statfs)>> c) { - get_fs_stats_(poolid, OpCompletionVert<struct ceph_statfs>(std::move(c))); - } int statfs_op_cancel(ceph_tid_t tid, int r); void _finish_statfs_op(StatfsOp *op, int r); diff --git a/src/pybind/mgr/cephadm/ceph_volume.py b/src/pybind/mgr/cephadm/ceph_volume.py new file mode 100644 index 00000000000..a270bb7028f --- /dev/null +++ b/src/pybind/mgr/cephadm/ceph_volume.py @@ -0,0 +1,430 @@ +from cephadm.serve import CephadmServe +from typing import List, TYPE_CHECKING, Any, Dict, Set, Tuple +if TYPE_CHECKING: + from cephadm import CephadmOrchestrator + + +class CephVolume: + def __init__(self, mgr: "CephadmOrchestrator", _inheritance: bool = False) -> None: + self.mgr: "CephadmOrchestrator" = mgr + if not _inheritance: + self.lvm_list: "CephVolumeLvmList" = CephVolumeLvmList(mgr) + + def run_json(self, hostname: str, command: List[str]) -> Dict[str, Any]: + """Execute a JSON command on the specified hostname and return the result. + + This method wraps the asynchronous execution of a JSON command on the + specified hostname, waiting for the command to complete. It utilizes the + `_run_json` method to perform the actual execution. + + Args: + hostname (str): The hostname of the target node where the JSON command + will be executed. + command (List[str]): A list of command arguments to be passed to the + JSON command. + + Returns: + Dict[str, Any]: A dictionary containing the JSON response from the + executed command, which may include various data + based on the command executed. + """ + return self.mgr.wait_async(self._run_json(hostname, command)) + + def run(self, hostname: str, command: List[str], **kw: Any) -> Tuple[List[str], List[str], int]: + """Execute a command on the specified hostname and return the result. + + This method wraps the asynchronous execution of a command on the + specified hostname, waiting for the command to complete. It utilizes the + `_run` method to perform the actual execution. + + Args: + hostname (str): The hostname of the target node where the command + will be executed. 
+ command (List[str]): A list of command arguments to be passed to the + command. + **kw (Any): Additional keyword arguments to customize the command + execution. + + Returns: + Tuple[List[str], List[str], int]: A tuple containing: + - A list of strings representing the standard output of the command. + - A list of strings representing the standard error output of the command. + - An integer representing the return code of the command execution. + """ + return self.mgr.wait_async(self._run(hostname, command, **kw)) + + async def _run(self, + hostname: str, + command: List[str], + **kw: Any) -> Tuple[List[str], List[str], int]: + """Execute a ceph-volume command on the specified hostname and return the result. + + This asynchronous method constructs a ceph-volume command and then executes + it on the specified host. + The result of the command is returned in JSON format. + + Args: + hostname (str): The hostname of the target node where the command will be executed. + command (List[str]): A list of command arguments to be passed to the Ceph command. + **kw (Any): Additional keyword arguments to customize the command execution. + + Returns: + Tuple[List[str], List[str], int]: A tuple containing: + - A list of strings representing the standard output of the command. + - A list of strings representing the standard error output of the command. + - An integer representing the return code of the command execution. + """ + cmd: List[str] = ['--'] + cmd.extend(command) + result = await CephadmServe(self.mgr)._run_cephadm( + hostname, 'osd', 'ceph-volume', + cmd, + **kw) + return result + + async def _run_json(self, + hostname: str, + command: List[str]) -> Dict[str, Any]: + """Execute a ceph-volume command on a specified hostname. + + This asynchronous method constructs a ceph-volume command and then executes + it on the specified host. + The result of the command is returned in JSON format. + + Args: + hostname (str): The hostname of the target node where the command will be executed. + command (List[str]): A list of command arguments to be passed to the Ceph command. + + Returns: + Dict[str, Any]: The result of the command execution as a dictionary parsed from + the JSON output. + """ + cmd: List[str] = ['--'] + cmd.extend(command) + result = await CephadmServe(self.mgr)._run_cephadm_json( + hostname, 'osd', 'ceph-volume', + cmd) + return result + + def clear_replace_header(self, hostname: str, device: str) -> str: + """Clear the replacement header on a specified device for a given hostname. + + This method checks if a replacement header exists on the specified device + and clears it if found. After clearing, it invalidates the cached device + information for the specified hostname and kicks the serve loop. + + Args: + hostname (str): The hostname of the device on which the replacement header + will be cleared. This is used to identify the specific + device within the manager's context. + device (str): The path to the device (e.g., '/dev/sda') from which the + replacement header will be cleared. + + Returns: + str: A message indicating the result of the operation. It will either confirm + that the replacement header was cleared or state that no replacement header + was detected on the device. 
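run() and run_json() are synchronous facades over an asynchronous cephadm invocation. The sketch below shows that wrapper pattern in a self-contained form; ToyCephVolume and its canned output are assumptions made for illustration, while the real methods delegate to mgr.wait_async() and CephadmServe._run_cephadm()/_run_cephadm_json():

```
import asyncio

class ToyCephVolume:
    """Illustrative stand-in: a synchronous facade over an async remote command."""

    async def _run_json(self, hostname, command):
        # pretend to run 'cephadm ceph-volume -- <command>' on the host
        await asyncio.sleep(0)
        return {"0": [{"type": "block", "devices": ["/dev/vdb"]}]}

    def run_json(self, hostname, command):
        # the real code uses mgr.wait_async(); asyncio.run() plays that role here
        return asyncio.run(self._run_json(hostname, command))

cv = ToyCephVolume()
print(cv.run_json("host1", ["lvm", "list", "--format", "json"]))
```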
+ """ + output: str = '' + result = self.run(hostname, ['lvm', + 'zap', + '--clear-replace-header', + device], + error_ok=True) + out, err, rc = result + if not rc: + output = f'Replacement header cleared on {device}' + self.mgr.cache.invalidate_host_devices(hostname) + self.mgr._kick_serve_loop() + else: + plain_out: str = '\n'.join(out) + plain_err: str = '\n'.join(err) + output = f'No replacement header could be cleared on {device}.\n{plain_out}\n{plain_err}' + return output + + +class CephVolumeLvmList(CephVolume): + def __init__(self, mgr: "CephadmOrchestrator") -> None: + super().__init__(mgr, True) + self.data: Dict[str, Any] = {} + + def get_data(self, hostname: str) -> None: + """Execute the `ceph-volume lvm list` command to list LVM-based OSDs. + + This asynchronous method interacts with the Ceph manager to retrieve + information about the Logical Volume Manager (LVM) devices associated + with the OSDs. It calls the `ceph-volume lvm list` command in JSON format + to gather relevant data. + + Returns: + None: This method does not return a value. The retrieved data is + stored in the `self.data` attribute for further processing. + """ + self.data = self.run_json(hostname, + ['lvm', 'list', '--format', 'json']) + + def devices_by_type(self, device_type: str) -> List[str]: + """Retrieve a list of devices of a specified type across all OSDs. + + This method iterates through all OSDs and collects devices that match + the specified type (e.g., 'block', 'db', 'wal'). The resulting list + contains unique device paths. + + Args: + device_type (str): The type of devices to retrieve. This should + be one of the recognized device types such as + 'block', 'db', or 'wal'. + + Returns: + List[str]: A list of unique device paths of the specified type + found across all OSDs. If no devices of the specified + type are found, an empty list is returned. + """ + result: Set[str] = set() + for osd in self.osd_ids(): + for lv in self.data.get(osd, []): + if lv.get('type') == device_type: + result.update(lv.get('devices', [])) + return list(result) + + def block_devices(self) -> List[str]: + """List all block devices used by OSDs. + + This method returns a list of devices that are used as 'block' devices + for storing the main OSD data. + + Returns: + List[str]: A list of device paths (strings) that are used as 'block' devices. + """ + return self.devices_by_type('block') + + def db_devices(self) -> List[str]: + """List all database (DB) devices used by OSDs. + + This method returns a list of devices that are used as 'db' devices + for storing the database files associated with OSDs. + + Returns: + List[str]: A list of device paths (strings) that are used as 'db' devices. + """ + return self.devices_by_type('db') + + def wal_devices(self) -> List[str]: + """List all write-ahead log (WAL) devices used by OSDs. + + This method returns a list of devices that are used as 'wal' devices + for storing write-ahead log data associated with OSDs. + + Returns: + List[str]: A list of device paths (strings) that are used as 'wal' devices. + """ + return self.devices_by_type('wal') + + def all_devices(self) -> List[str]: + """List all devices used by OSDs for 'block', 'db', or 'wal' purposes. + + This method aggregates all devices that are currently used by the OSDs + in the system for the following device types: + - 'block' devices: Used to store the OSD's data. + - 'db' devices: Used for database purposes. + - 'wal' devices: Used for Write-Ahead Logging. 
+ + The returned list combines devices from all these categories. + + Returns: + List[str]: A list of device paths (strings) that are used as 'block', 'db', or 'wal' devices. + """ + return self.block_devices() + self.db_devices() + self.wal_devices() + + def device_osd_mapping(self, device_type: str = '') -> Dict[str, Dict[str, List[str]]]: + """Create a mapping of devices to their corresponding OSD IDs based on device type. + + This method serves as a 'proxy' function, designed to be called by the *_device_osd_mapping() methods. + + This method iterates over the OSDs and their logical volumes to build a + dictionary that maps each device of the specified type to the list of + OSD IDs that use it. The resulting dictionary can be used to determine + which OSDs share a specific device. + + Args: + device_type (str): The type of the device to filter by (e.g., 'block', 'db', or 'wal'). + If an empty string is provided, devices of all types will be included. + + Returns: + Dict[str, Dict[str, List[str]]]: A dictionary where the keys are device + names and the values are dictionaries containing a list of OSD IDs + that use the corresponding device. + + eg: + ``` + { + '/dev/vda': {'osd_ids': ['0', '1']}, + '/dev/vdb': {'osd_ids': ['2']} + } + ``` + + """ + result: Dict[str, Dict[str, List[str]]] = {} + for osd in self.osd_ids(): + for lv in self.data.get(osd, []): + if lv.get('type') == device_type or not device_type: + for device in lv.get('devices', []): + if device not in result: + result[device] = {'osd_ids': []} + result[device]['osd_ids'].append(osd) + return result + + def block_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]: + """Get a dictionnary with all block devices and their corresponding + osd(s) id(s). + + eg: + ``` + {'/dev/vdb': {'osd_ids': ['0']}, + '/dev/vdc': {'osd_ids': ['1']}, + '/dev/vdf': {'osd_ids': ['2']}, + '/dev/vde': {'osd_ids': ['3', '4']}} + ``` + + Returns: + Dict[str, Dict[str, List[str]]]: A dict including all block devices with their corresponding + osd id(s). + """ + return self.device_osd_mapping('block') + + def db_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]: + """Get a dictionnary with all db devices and their corresponding + osd(s) id(s). + + eg: + ``` + {'/dev/vdv': {'osd_ids': ['0', '1', '2', '3']}, + '/dev/vdx': {'osd_ids': ['4']}} + ``` + + Returns: + Dict[str, Dict[str, List[str]]]: A dict including all db devices with their corresponding + osd id(s). + """ + return self.device_osd_mapping('db') + + def wal_device_osd_mapping(self) -> Dict[str, Dict[str, List[str]]]: + """Get a dictionnary with all wal devices and their corresponding + osd(s) id(s). + + eg: + ``` + {'/dev/vdy': {'osd_ids': ['0', '1', '2', '3']}, + '/dev/vdz': {'osd_ids': ['4']}} + ``` + + Returns: + Dict[str, Dict[str, List[str]]]: A dict including all wal devices with their corresponding + osd id(s). + """ + return self.device_osd_mapping('wal') + + def is_shared_device(self, device: str) -> bool: + """Determines if a device is shared between multiple OSDs. + + This method checks if a given device is shared by multiple OSDs for a specified device type + (such as 'block', 'db', or 'wal'). If the device is associated with more than one OSD, + it is considered shared. + + Args: + device (str): The device path to check (e.g., '/dev/sda'). + device_type (str): The type of the device (e.g., 'block', 'db', 'wal'). + + Raises: + RuntimeError: If the device is not valid or not found in the shared devices mapping. 
+ + Returns: + bool: True if the device is shared by more than one OSD, False otherwise. + """ + device_osd_mapping = self.device_osd_mapping() + if not device or device not in device_osd_mapping: + raise RuntimeError('Not a valid device path.') + return len(device_osd_mapping[device]['osd_ids']) > 1 + + def is_block_device(self, device: str) -> bool: + """Check if a specified device is a block device. + + This method checks if the specified device is included in the + list of block devices used by OSDs. + + Args: + device (str): The path of the device to check. + + Returns: + bool: True if the device is a block device, + False otherwise. + """ + return device in self.block_devices() + + def is_db_device(self, device: str) -> bool: + """Check if a specified device is a DB device. + + This method checks if the specified device is included in the + list of DB devices used by OSDs. + + Args: + device (str): The path of the device to check. + + Returns: + bool: True if the device is a DB device, + False otherwise. + """ + return device in self.db_devices() + + def is_wal_device(self, device: str) -> bool: + """Check if a specified device is a WAL device. + + This method checks if the specified device is included in the + list of WAL devices used by OSDs. + + Args: + device (str): The path of the device to check. + + Returns: + bool: True if the device is a WAL device, + False otherwise. + """ + return device in self.wal_devices() + + def get_block_devices_from_osd_id(self, osd_id: str) -> List[str]: + """Retrieve the list of block devices associated with a given OSD ID. + + This method looks up the specified OSD ID in the `data` attribute + and returns a list of devices that are of type 'block'. If there are + no devices of type 'block' for the specified OSD ID, an empty list is returned. + + Args: + osd_id (str): The OSD ID for which to retrieve block devices. + + Returns: + List[str]: A list of block device paths associated with the + specified OSD ID. If no block devices are found, + an empty list is returned. + """ + result: List[str] = [] + for lv in self.data.get(osd_id, []): + if lv.get('type') == 'block': + result = lv.get('devices', []) + return result + + def osd_ids(self) -> List[str]: + """Retrieve the list of OSD IDs. + + This method returns a list of OSD IDs by extracting the keys + from the `data` attribute, which is expected to contain + information about OSDs. If there is no data available, an + empty list is returned. + + Returns: + List[str]: A list of OSD IDs. If no data is present, + an empty list is returned. 
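Taken together, device_osd_mapping() and is_shared_device() decide whether replacing a device would take down more than one OSD. The following standalone sketch re-implements that logic over the same JSON shape returned by `ceph-volume lvm list --format json`; the sample data is made up:

```
from typing import Any, Dict, List

# made-up 'ceph-volume lvm list' output: OSDs 0 and 1 share a DB device
lvm_list: Dict[str, List[Dict[str, Any]]] = {
    '0': [{'type': 'block', 'devices': ['/dev/vdb']},
          {'type': 'db', 'devices': ['/dev/vdv']}],
    '1': [{'type': 'block', 'devices': ['/dev/vde']},
          {'type': 'db', 'devices': ['/dev/vdv']}],
}

def device_osd_mapping(data, device_type: str = ''):
    result: Dict[str, Dict[str, List[str]]] = {}
    for osd_id, lvs in data.items():
        for lv in lvs:
            if not device_type or lv.get('type') == device_type:
                for device in lv.get('devices', []):
                    result.setdefault(device, {'osd_ids': []})['osd_ids'].append(osd_id)
    return result

def is_shared_device(data, device: str) -> bool:
    mapping = device_osd_mapping(data)
    if device not in mapping:
        raise RuntimeError('Not a valid device path.')
    return len(mapping[device]['osd_ids']) > 1

print(device_osd_mapping(lvm_list, 'db'))      # {'/dev/vdv': {'osd_ids': ['0', '1']}}
print(is_shared_device(lvm_list, '/dev/vdv'))  # True: zapping it destroys OSDs 0 and 1
```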
+ """ + result: List[str] = [] + if self.data: + result = list(self.data.keys()) + return result diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 5216c489064..1acc2ad2f2d 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -101,6 +101,7 @@ from .utils import CEPH_IMAGE_TYPES, RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES, forall from .configchecks import CephadmConfigChecks from .offline_watcher import OfflineHostWatcher from .tuned_profiles import TunedProfileUtils +from .ceph_volume import CephVolume try: import asyncssh @@ -135,13 +136,13 @@ DEFAULT_IMAGE = 'quay.io/ceph/ceph' DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.51.0' DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.7.0' DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.2.17' -DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:3.0.0' -DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:3.0.0' +DEFAULT_LOKI_IMAGE = 'quay.io/ceph/loki:3.0.0' +DEFAULT_PROMTAIL_IMAGE = 'quay.io/ceph/promtail:3.0.0' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.27.0' DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/grafana:10.4.8' DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3' DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4' -DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1' +DEFAULT_SNMP_GATEWAY_IMAGE = 'quay.io/ceph/snmp-notifier:v1.2.1' DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23' DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29' DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29' @@ -446,7 +447,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, Option( 'default_registry', type='str', - default='docker.io', + default='quay.io', desc='Search-registry to which we should normalize unqualified image names. 
' 'This is not the default registry', ), @@ -764,6 +765,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.iscsi_service: IscsiService = cast(IscsiService, self.cephadm_services['iscsi']) self.nvmeof_service: NvmeofService = cast(NvmeofService, self.cephadm_services['nvmeof']) self.node_proxy_service: NodeProxy = cast(NodeProxy, self.cephadm_services['node-proxy']) + self.rgw_service: RgwService = cast(RgwService, self.cephadm_services['rgw']) self.scheduled_async_actions: List[Callable] = [] @@ -791,6 +793,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, # as part of the handling of stray daemons self.recently_altered_daemons: Dict[str, datetime.datetime] = {} + self.ceph_volume: CephVolume = CephVolume(self) + def shutdown(self) -> None: self.log.debug('shutdown') self._worker_pool.close() @@ -818,30 +822,33 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, security_enabled = self.secure_monitoring_stack or mgmt_gw_enabled return security_enabled, mgmt_gw_enabled, oauth2_proxy_enabled - def get_mgmt_gw_internal_endpoint(self) -> Optional[str]: + def _get_mgmt_gw_endpoint(self, is_internal: bool) -> Optional[str]: mgmt_gw_daemons = self.cache.get_daemons_by_service('mgmt-gateway') if not mgmt_gw_daemons: return None dd = mgmt_gw_daemons[0] assert dd.hostname is not None - mgmt_gw_addr = self.get_fqdn(dd.hostname) - mgmt_gw_internal_endpoint = build_url(scheme='https', host=mgmt_gw_addr, port=MgmtGatewayService.INTERNAL_SERVICE_PORT) - return f'{mgmt_gw_internal_endpoint}/internal' + mgmt_gw_spec = cast(MgmtGatewaySpec, self.spec_store['mgmt-gateway'].spec) + mgmt_gw_addr = mgmt_gw_spec.virtual_ip if mgmt_gw_spec.virtual_ip is not None else self.get_fqdn(dd.hostname) - def get_mgmt_gw_external_endpoint(self) -> Optional[str]: - mgmt_gw_daemons = self.cache.get_daemons_by_service('mgmt-gateway') - if not mgmt_gw_daemons: - return None + if is_internal: + mgmt_gw_port: Optional[int] = MgmtGatewayService.INTERNAL_SERVICE_PORT + protocol = 'https' + endpoint_suffix = '/internal' + else: + mgmt_gw_port = dd.ports[0] if dd.ports else None + protocol = 'http' if mgmt_gw_spec.disable_https else 'https' + endpoint_suffix = '' - dd = mgmt_gw_daemons[0] - assert dd.hostname is not None - mgmt_gw_port = dd.ports[0] if dd.ports else None - mgmt_gw_addr = self.get_fqdn(dd.hostname) - mgmt_gw_spec = cast(MgmtGatewaySpec, self.spec_store['mgmt-gateway'].spec) - protocol = 'http' if mgmt_gw_spec.disable_https else 'https' - mgmt_gw_external_endpoint = build_url(scheme=protocol, host=mgmt_gw_addr, port=mgmt_gw_port) - return mgmt_gw_external_endpoint + mgmt_gw_endpoint = build_url(scheme=protocol, host=mgmt_gw_addr, port=mgmt_gw_port) + return f'{mgmt_gw_endpoint}{endpoint_suffix}' + + def get_mgmt_gw_internal_endpoint(self) -> Optional[str]: + return self._get_mgmt_gw_endpoint(is_internal=True) + + def get_mgmt_gw_external_endpoint(self) -> Optional[str]: + return self._get_mgmt_gw_endpoint(is_internal=False) def _get_cephadm_binary_path(self) -> str: import hashlib @@ -3000,8 +3007,16 @@ Then run the following: daemon_names.append(dd.name()) return daemon_names - alertmanager_user, alertmanager_password = self._get_alertmanager_credentials() - prometheus_user, prometheus_password = self._get_prometheus_credentials() + prom_cred_hash = None + alertmgr_cred_hash = None + security_enabled, mgmt_gw_enabled, _ = self._get_security_config() + if security_enabled: + alertmanager_user, alertmanager_password = self._get_alertmanager_credentials() + 
prometheus_user, prometheus_password = self._get_prometheus_credentials() + if prometheus_user and prometheus_password: + prom_cred_hash = f'{utils.md5_hash(prometheus_user + prometheus_password)}' + if alertmanager_user and alertmanager_password: + alertmgr_cred_hash = f'{utils.md5_hash(alertmanager_user + alertmanager_password)}' deps = [] if daemon_type == 'haproxy': @@ -3048,9 +3063,10 @@ Then run the following: else: deps = [self.get_mgr_ip()] elif daemon_type == 'prometheus': - # for prometheus we add the active mgr as an explicit dependency, - # this way we force a redeploy after a mgr failover - deps.append(self.get_active_mgr().name()) + if not mgmt_gw_enabled: + # for prometheus we add the active mgr as an explicit dependency, + # this way we force a redeploy after a mgr failover + deps.append(self.get_active_mgr().name()) deps.append(str(self.get_module_option_ex('prometheus', 'server_port', 9283))) deps.append(str(self.service_discovery_port)) # prometheus yaml configuration file (generated by prometheus.yml.j2) contains @@ -3067,22 +3083,20 @@ Then run the following: deps += [d.name() for d in self.cache.get_daemons_by_service('ceph-exporter')] deps += [d.name() for d in self.cache.get_daemons_by_service('mgmt-gateway')] deps += [d.name() for d in self.cache.get_daemons_by_service('oauth2-proxy')] - security_enabled, _, _ = self._get_security_config() - if security_enabled: - if prometheus_user and prometheus_password: - deps.append(f'{hash(prometheus_user + prometheus_password)}') - if alertmanager_user and alertmanager_password: - deps.append(f'{hash(alertmanager_user + alertmanager_password)}') + if prom_cred_hash is not None: + deps.append(prom_cred_hash) + if alertmgr_cred_hash is not None: + deps.append(alertmgr_cred_hash) elif daemon_type == 'grafana': deps += get_daemon_names(['prometheus', 'loki', 'mgmt-gateway', 'oauth2-proxy']) - security_enabled, _, _ = self._get_security_config() - if security_enabled and prometheus_user and prometheus_password: - deps.append(f'{hash(prometheus_user + prometheus_password)}') + if prom_cred_hash is not None: + deps.append(prom_cred_hash) elif daemon_type == 'alertmanager': - deps += get_daemon_names(['mgr', 'alertmanager', 'snmp-gateway', 'mgmt-gateway', 'oauth2-proxy']) - security_enabled, _, _ = self._get_security_config() - if security_enabled and alertmanager_user and alertmanager_password: - deps.append(f'{hash(alertmanager_user + alertmanager_password)}') + deps += get_daemon_names(['alertmanager', 'snmp-gateway', 'mgmt-gateway', 'oauth2-proxy']) + if not mgmt_gw_enabled: + deps += get_daemon_names(['mgr']) + if alertmgr_cred_hash is not None: + deps.append(alertmgr_cred_hash) elif daemon_type == 'promtail': deps += get_daemon_names(['loki']) elif daemon_type in ['ceph-exporter', 'node-exporter']: @@ -3094,9 +3108,7 @@ Then run the following: deps.append(build_url(host=dd.hostname, port=port).lstrip('/')) deps = sorted(deps) elif daemon_type == 'mgmt-gateway': - # url_prefix for monitoring daemons depends on the presence of mgmt-gateway - # while dashboard urls depend on the mgr daemons - deps += get_daemon_names(['mgr', 'grafana', 'prometheus', 'alertmanager', 'oauth2-proxy']) + deps = MgmtGatewayService.get_dependencies(self) else: # this daemon type doesn't need deps mgmt pass @@ -3609,7 +3621,12 @@ Then run the following: return "Scheduled %s update..." 
% spec.service_name() @handle_orch_error - def apply(self, specs: Sequence[GenericSpec], no_overwrite: bool = False) -> List[str]: + def apply( + self, + specs: Sequence[GenericSpec], + no_overwrite: bool = False, + continue_on_error: bool = True + ) -> List[str]: results = [] for spec in specs: if no_overwrite: @@ -3621,7 +3638,14 @@ Then run the following: results.append('Skipped %s service spec. To change %s spec omit --no-overwrite flag' % (cast(ServiceSpec, spec).service_name(), cast(ServiceSpec, spec).service_name())) continue - results.append(self._apply(spec)) + try: + res = self._apply(spec) + results.append(res) + except Exception as e: + if continue_on_error: + results.append(f'Failed to apply spec for {spec}: {str(e)}') + else: + raise e return results @handle_orch_error @@ -3828,8 +3852,55 @@ Then run the following: return self.upgrade.upgrade_stop() @handle_orch_error + def replace_device(self, + hostname: str, + device: str, + clear: bool = False, + yes_i_really_mean_it: bool = False) -> Any: + output: str = '' + + self.ceph_volume.lvm_list.get_data(hostname=hostname) + + if clear: + output = self.ceph_volume.clear_replace_header(hostname, device) + else: + osds_to_zap: List[str] = [] + if hostname not in list(self.inventory.keys()): + raise OrchestratorError(f'{hostname} invalid host.') + + if device not in self.ceph_volume.lvm_list.all_devices(): + raise OrchestratorError(f"{device} doesn't appear to be used for an OSD, not a valid device in {hostname}.") + + device_osd_mapping = self.ceph_volume.lvm_list.device_osd_mapping() + osds_to_zap = device_osd_mapping[device]['osd_ids'] + + if self.ceph_volume.lvm_list.is_shared_device(device): + if not yes_i_really_mean_it: + raise OrchestratorError(f'{device} is a shared device.\n' + f'Replacing {device} implies destroying OSD(s): {osds_to_zap}.\n' + 'Please, *be very careful*, this can be a very dangerous operation.\n' + 'If you know what you are doing, pass --yes-i-really-mean-it') + if not self.to_remove_osds.rm_util.safe_to_destroy([int(osd_id) for osd_id in osds_to_zap]): + raise OrchestratorError(f"Destroying OSD(s) {osds_to_zap} would cause some PGs to be undersized/degraded.\n" + 'Refusing to proceed.') + replace_block: bool = self.ceph_volume.lvm_list.is_block_device(device) + replace_db: bool = self.ceph_volume.lvm_list.is_db_device(device) + replace_wal: bool = self.ceph_volume.lvm_list.is_wal_device(device) + + self.remove_osds(list(osds_to_zap), + replace_block=replace_block, + replace_db=replace_db, + replace_wal=replace_wal) + + output = f'Scheduled to destroy osds: {osds_to_zap} and mark {device} as being replaced.' 
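Before scheduling any removal, replace_device() applies two guard rails: a shared device requires an explicit --yes-i-really-mean-it, and the affected OSDs must pass the safe-to-destroy check. A condensed, standalone sketch of that decision follows; safe_to_destroy here is a caller-supplied stub, not the real removal-queue utility:

```
def plan_device_replacement(device, osd_ids, shared, safe_to_destroy,
                            yes_i_really_mean_it=False):
    """Toy version of the guard rails applied before an OSD device replacement."""
    if shared and not yes_i_really_mean_it:
        raise RuntimeError(
            f'{device} is a shared device; replacing it destroys OSD(s) {osd_ids}. '
            'Pass --yes-i-really-mean-it to proceed.')
    if not safe_to_destroy(osd_ids):
        raise RuntimeError(
            f'Destroying OSD(s) {osd_ids} would leave PGs undersized/degraded.')
    return f'Scheduled to destroy osds: {osd_ids} and mark {device} as being replaced.'

# example: a shared DB device backing two OSDs, with an always-true safety stub
print(plan_device_replacement('/dev/vdv', ['0', '1'], shared=True,
                              safe_to_destroy=lambda ids: True,
                              yes_i_really_mean_it=True))
```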
+ return output + + @handle_orch_error def remove_osds(self, osd_ids: List[str], replace: bool = False, + replace_block: bool = False, + replace_db: bool = False, + replace_wal: bool = False, force: bool = False, zap: bool = False, no_destroy: bool = False) -> str: @@ -3852,6 +3923,9 @@ Then run the following: try: self.to_remove_osds.enqueue(OSD(osd_id=int(daemon.daemon_id), replace=replace, + replace_block=replace_block, + replace_db=replace_db, + replace_wal=replace_wal, force=force, zap=zap, no_destroy=no_destroy, diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index c6212c9efb8..4a7959ae045 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -96,7 +96,10 @@ class CephadmServe: if not self.mgr.paused: self._run_async_actions() - self.mgr.to_remove_osds.process_removal_queue() + removal_queue_result = self.mgr.to_remove_osds.process_removal_queue() + self.log.debug(f'process_removal_queue() returned = {removal_queue_result}') + if removal_queue_result: + continue self.mgr.migration.migrate() if self.mgr.migration.is_migration_ongoing(): @@ -950,6 +953,10 @@ class CephadmServe: ) continue + # set multisite config before deploying the rgw daemon + if service_type == 'rgw': + self.mgr.rgw_service.set_realm_zg_zone(cast(RGWSpec, spec)) + # deploy new daemon daemon_id = slot.name diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index eb9a1c838a6..9043577bc5a 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -984,10 +984,9 @@ class RgwService(CephService): def allow_colo(self) -> bool: return True - def config(self, spec: RGWSpec) -> None: # type: ignore + def set_realm_zg_zone(self, spec: RGWSpec) -> None: assert self.TYPE == spec.service_type - # set rgw_realm rgw_zonegroup and rgw_zone, if present if spec.rgw_realm: ret, out, err = self.mgr.check_mon_command({ 'prefix': 'config set', @@ -1010,6 +1009,12 @@ class RgwService(CephService): 'value': spec.rgw_zone, }) + def config(self, spec: RGWSpec) -> None: # type: ignore + assert self.TYPE == spec.service_type + + # set rgw_realm rgw_zonegroup and rgw_zone, if present + self.set_realm_zg_zone(spec) + if spec.generate_cert and not spec.rgw_frontend_ssl_certificate: # generate a self-signed cert for the rgw service cert, key = self.mgr.cert_mgr.ssl_certs.generate_root_cert(custom_san_list=spec.zonegroup_hostnames) diff --git a/src/pybind/mgr/cephadm/services/ingress.py b/src/pybind/mgr/cephadm/services/ingress.py index a17000cd632..7381ef67d7e 100644 --- a/src/pybind/mgr/cephadm/services/ingress.py +++ b/src/pybind/mgr/cephadm/services/ingress.py @@ -241,7 +241,12 @@ class IngressService(CephService): if spec.keepalived_password: password = spec.keepalived_password - daemons = self.mgr.cache.get_daemons_by_service(spec.service_name()) + if spec.keepalive_only: + # when keepalive_only instead of haproxy, we have to monitor the backend service daemons + if spec.backend_service is not None: + daemons = self.mgr.cache.get_daemons_by_service(spec.backend_service) + else: + daemons = self.mgr.cache.get_daemons_by_service(spec.service_name()) if not daemons and not spec.keepalive_only: raise OrchestratorError( @@ -297,6 +302,10 @@ class IngressService(CephService): port = d.ports[1] # monitoring port host_ip = d.ip or self.mgr.inventory.get_addr(d.hostname) script = f'/usr/bin/curl {build_url(scheme="http", host=host_ip, port=port)}/health' + elif 
d.daemon_type == 'mgmt-gateway': + mgmt_gw_port = d.ports[0] if d.ports else None + host_ip = d.ip or self.mgr.inventory.get_addr(d.hostname) + script = f'/usr/bin/curl -k {build_url(scheme="https", host=host_ip, port=mgmt_gw_port)}/health' assert script states = [] diff --git a/src/pybind/mgr/cephadm/services/mgmt_gateway.py b/src/pybind/mgr/cephadm/services/mgmt_gateway.py index 1943264025e..0897ce99ff7 100644 --- a/src/pybind/mgr/cephadm/services/mgmt_gateway.py +++ b/src/pybind/mgr/cephadm/services/mgmt_gateway.py @@ -1,10 +1,12 @@ import logging -from typing import List, Any, Tuple, Dict, cast, Optional +from typing import List, Any, Tuple, Dict, cast, TYPE_CHECKING from orchestrator import DaemonDescription from ceph.deployment.service_spec import MgmtGatewaySpec, GrafanaSpec from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec, get_dashboard_endpoints +if TYPE_CHECKING: + from ..module import CephadmOrchestrator logger = logging.getLogger(__name__) @@ -36,10 +38,11 @@ class MgmtGatewayService(CephadmService): # if empty list provided, return empty Daemon Desc return DaemonDescription() - def get_oauth2_service_url(self) -> Optional[str]: - # TODO(redo): check how can we create several servers for HA - oauth2_servers = self.get_service_endpoints('oauth2-proxy') - return f'https://{oauth2_servers[0]}' if oauth2_servers else None + def get_mgmt_gw_ips(self, svc_spec: MgmtGatewaySpec, daemon_spec: CephadmDaemonDeploySpec) -> List[str]: + mgmt_gw_ips = [self.mgr.inventory.get_addr(daemon_spec.host)] + if svc_spec.virtual_ip is not None: + mgmt_gw_ips.append(svc_spec.virtual_ip) + return mgmt_gw_ips def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None: # we adjust the standby behaviour so rev-proxy can pick correctly the active instance @@ -56,9 +59,9 @@ class MgmtGatewayService(CephadmService): key = svc_spec.ssl_certificate_key else: # not provided on the spec, let's generate self-sigend certificates - addr = self.mgr.inventory.get_addr(daemon_spec.host) + ips = self.get_mgmt_gw_ips(svc_spec, daemon_spec) host_fqdn = self.mgr.get_fqdn(daemon_spec.host) - cert, key = self.mgr.cert_mgr.generate_cert(host_fqdn, addr) + cert, key = self.mgr.cert_mgr.generate_cert(host_fqdn, ips) # save certificates if cert and key: self.mgr.cert_key_store.save_cert('mgmt_gw_cert', cert) @@ -67,23 +70,33 @@ class MgmtGatewayService(CephadmService): logger.error("Failed to obtain certificate and key from mgmt-gateway.") return cert, key - def get_internal_certificates(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[str, str]: - node_ip = self.mgr.inventory.get_addr(daemon_spec.host) + def get_internal_certificates(self, svc_spec: MgmtGatewaySpec, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[str, str]: + ips = self.get_mgmt_gw_ips(svc_spec, daemon_spec) host_fqdn = self.mgr.get_fqdn(daemon_spec.host) - return self.mgr.cert_mgr.generate_cert(host_fqdn, node_ip) + return self.mgr.cert_mgr.generate_cert(host_fqdn, ips) - def get_mgmt_gateway_deps(self) -> List[str]: - # url_prefix for the following services depends on the presence of mgmt-gateway - deps: List[str] = [] - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('prometheus')] - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('alertmanager')] - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('grafana')] - deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('oauth2-proxy')] + def get_service_discovery_endpoints(self) -> List[str]: + 
sd_endpoints = [] for dd in self.mgr.cache.get_daemons_by_service('mgr'): - # we consider mgr a dep even if the dashboard is disabled - # in order to be consistent with _calc_daemon_deps(). - deps.append(dd.name()) + assert dd.hostname is not None + addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname) + sd_endpoints.append(f"{addr}:{self.mgr.service_discovery_port}") + return sd_endpoints + @staticmethod + def get_dependencies(mgr: "CephadmOrchestrator") -> List[str]: + # url_prefix for the following services depends on the presence of mgmt-gateway + deps = [ + f'{d.name()}:{d.ports[0]}' if d.ports else d.name() + for service in ['prometheus', 'alertmanager', 'grafana', 'oauth2-proxy'] + for d in mgr.cache.get_daemons_by_service(service) + ] + # dashboard and service discovery urls depend on the mgr daemons + deps += [ + f'{d.name()}' + for service in ['mgr'] + for d in mgr.cache.get_daemons_by_service(service) + ] return deps def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: @@ -94,6 +107,8 @@ class MgmtGatewayService(CephadmService): prometheus_endpoints = self.get_service_endpoints('prometheus') alertmanager_endpoints = self.get_service_endpoints('alertmanager') grafana_endpoints = self.get_service_endpoints('grafana') + oauth2_proxy_endpoints = self.get_service_endpoints('oauth2-proxy') + service_discovery_endpoints = self.get_service_discovery_endpoints() try: grafana_spec = cast(GrafanaSpec, self.mgr.spec_store['grafana'].spec) grafana_protocol = grafana_spec.protocol @@ -104,7 +119,9 @@ class MgmtGatewayService(CephadmService): 'dashboard_endpoints': dashboard_endpoints, 'prometheus_endpoints': prometheus_endpoints, 'alertmanager_endpoints': alertmanager_endpoints, - 'grafana_endpoints': grafana_endpoints + 'grafana_endpoints': grafana_endpoints, + 'oauth2_proxy_endpoints': oauth2_proxy_endpoints, + 'service_discovery_endpoints': service_discovery_endpoints } server_context = { 'spec': svc_spec, @@ -117,11 +134,12 @@ class MgmtGatewayService(CephadmService): 'prometheus_endpoints': prometheus_endpoints, 'alertmanager_endpoints': alertmanager_endpoints, 'grafana_endpoints': grafana_endpoints, - 'oauth2_proxy_url': self.get_oauth2_service_url(), + 'service_discovery_endpoints': service_discovery_endpoints, + 'enable_oauth2_proxy': bool(oauth2_proxy_endpoints), } cert, key = self.get_external_certificates(svc_spec, daemon_spec) - internal_cert, internal_pkey = self.get_internal_certificates(daemon_spec) + internal_cert, internal_pkey = self.get_internal_certificates(svc_spec, daemon_spec) daemon_config = { "files": { "nginx.conf": self.mgr.template.render(self.SVC_TEMPLATE_PATH, main_context), @@ -136,7 +154,7 @@ class MgmtGatewayService(CephadmService): daemon_config["files"]["nginx.crt"] = cert daemon_config["files"]["nginx.key"] = key - return daemon_config, sorted(self.get_mgmt_gateway_deps()) + return daemon_config, sorted(MgmtGatewayService.get_dependencies(self.mgr)) def pre_remove(self, daemon: DaemonDescription) -> None: """ diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 6a57e3b31ef..1b9cf618570 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -8,10 +8,11 @@ from mgr_module import HandleCommandResult from orchestrator import DaemonDescription from ceph.deployment.service_spec import AlertManagerSpec, GrafanaSpec, ServiceSpec, \ - SNMPGatewaySpec, PrometheusSpec + SNMPGatewaySpec, 
PrometheusSpec, MgmtGatewaySpec from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec, get_dashboard_urls from mgr_util import verify_tls, ServerConfigException, build_url, get_cert_issuer_info, password_hash from ceph.deployment.utils import wrap_ipv6 +from .. import utils logger = logging.getLogger(__name__) @@ -57,15 +58,17 @@ class GrafanaService(CephadmService): daemon_spec.port_ips = {str(grafana_port): ip_to_bind_to} grafana_ip = ip_to_bind_to - mgmt_gw_ip = None domain = self.mgr.get_fqdn(daemon_spec.host) + mgmt_gw_ips = [] if mgmt_gw_enabled: mgmt_gw_daemons = self.mgr.cache.get_daemons_by_service('mgmt-gateway') if mgmt_gw_daemons: dd = mgmt_gw_daemons[0] assert dd.hostname - domain = self.mgr.get_fqdn(dd.hostname) - mgmt_gw_ip = self.mgr.inventory.get_addr(dd.hostname) + mgmt_gw_spec = cast(MgmtGatewaySpec, self.mgr.spec_store['mgmt-gateway'].spec) + # TODO(redo): should we resolve the virtual_ip to a name if possible? + domain = mgmt_gw_spec.virtual_ip or self.mgr.get_fqdn(dd.hostname) # give prio to VIP if configured + mgmt_gw_ips = [self.mgr.inventory.get_addr(dd.hostname) for dd in mgmt_gw_daemons] # type: ignore return self.mgr.template.render('services/grafana/grafana.ini.j2', { 'anonymous_access': spec.anonymous_access, @@ -76,7 +79,7 @@ class GrafanaService(CephadmService): 'domain': domain, 'mgmt_gw_enabled': mgmt_gw_enabled, 'oauth2_enabled': oauth2_enabled, - 'mgmt_gw_ip': mgmt_gw_ip, + 'mgmt_gw_ips': ','.join(mgmt_gw_ips), }) def calculate_grafana_deps(self, security_enabled: bool) -> List[str]: @@ -87,7 +90,7 @@ class GrafanaService(CephadmService): # in case security is enabled we have to reconfig when prom user/pass changes prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials() if security_enabled and prometheus_user and prometheus_password: - deps.append(f'{hash(prometheus_user + prometheus_password)}') + deps.append(f'{utils.md5_hash(prometheus_user + prometheus_password)}') # adding a dependency for mgmt-gateway because the usage of url_prefix relies on its presence. # another dependency is added for oauth-proxy as Grafana login is delegated to this service when enabled. @@ -311,17 +314,18 @@ class AlertmanagerService(CephadmService): # add a dependency since enbling basic-auth (or not) depends on the existence of 'oauth2-proxy' deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('oauth2-proxy')] - # scan all mgrs to generate deps and to get standbys too. - for dd in self.mgr.cache.get_daemons_by_service('mgr'): - # we consider mgr a dep even if the dashboard is disabled - # in order to be consistent with _calc_daemon_deps(). - deps.append(dd.name()) - security_enabled, mgmt_gw_enabled, oauth2_enabled = self.mgr._get_security_config() if mgmt_gw_enabled: dashboard_urls = [f'{self.mgr.get_mgmt_gw_internal_endpoint()}/dashboard'] else: dashboard_urls = get_dashboard_urls(self) + # scan all mgrs to generate deps and to get standbys too. + for dd in self.mgr.cache.get_daemons_by_service('mgr'): + # we consider mgr a dep even if the dashboard is disabled + # in order to be consistent with _calc_daemon_deps(). 
+ # when mgmt_gw is enabled there's no need for mgr dep as + # mgmt-gw will route to the active mgr automatically + deps.append(dd.name()) snmp_gateway_urls: List[str] = [] for dd in self.mgr.cache.get_daemons_by_service('snmp-gateway'): @@ -354,7 +358,7 @@ class AlertmanagerService(CephadmService): if security_enabled: alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() if alertmanager_user and alertmanager_password: - deps.append(f'{hash(alertmanager_user + alertmanager_password)}') + deps.append(f'{utils.md5_hash(alertmanager_user + alertmanager_password)}') cert, key = self.get_alertmanager_certificates(daemon_spec) context = { 'enable_mtls': mgmt_gw_enabled, @@ -489,8 +493,14 @@ class PrometheusService(CephadmService): security_enabled, mgmt_gw_enabled, oauth2_enabled = self.mgr._get_security_config() port = self.mgr.service_discovery_port mgr_addr = wrap_ipv6(self.mgr.get_mgr_ip()) + protocol = 'https' if security_enabled else 'http' - srv_end_point = f'{protocol}://{mgr_addr}:{port}/sd/prometheus/sd-config?' + self.mgr.get_mgmt_gw_internal_endpoint() + if mgmt_gw_enabled: + service_discovery_url_prefix = f'{self.mgr.get_mgmt_gw_internal_endpoint()}' + else: + service_discovery_url_prefix = f'{protocol}://{mgr_addr}:{port}' + srv_end_point = f'{service_discovery_url_prefix}/sd/prometheus/sd-config?' node_exporter_cnt = len(self.mgr.cache.get_daemons_by_service('node-exporter')) alertmgr_cnt = len(self.mgr.cache.get_daemons_by_service('alertmanager')) @@ -617,18 +627,23 @@ class PrometheusService(CephadmService): port = cast(int, self.mgr.get_module_option_ex('prometheus', 'server_port', self.DEFAULT_MGR_PROMETHEUS_PORT)) deps.append(str(port)) deps.append(str(self.mgr.service_discovery_port)) - # add an explicit dependency on the active manager. This will force to - # re-deploy prometheus if the mgr has changed (due to a fail-over i.e). - deps.append(self.mgr.get_active_mgr().name()) deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}') - security_enabled, _, _ = self.mgr._get_security_config() + security_enabled, mgmt_gw_enabled, _ = self.mgr._get_security_config() + + if not mgmt_gw_enabled: + # add an explicit dependency on the active manager. This will force to + # re-deploy prometheus if the mgr has changed (due to a fail-over i.e).
+ # when mgmt_gw is enabled there's no need for such dep as mgmt-gw will + # route to the active mgr automatically + deps.append(self.mgr.get_active_mgr().name()) + if security_enabled: alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials() if prometheus_user and prometheus_password: - deps.append(f'{hash(prometheus_user + prometheus_password)}') + deps.append(f'{utils.md5_hash(prometheus_user + prometheus_password)}') if alertmanager_user and alertmanager_password: - deps.append(f'{hash(alertmanager_user + alertmanager_password)}') + deps.append(f'{utils.md5_hash(alertmanager_user + alertmanager_password)}') # add a dependency since url_prefix depends on the existence of mgmt-gateway deps += [d.name() for d in self.mgr.cache.get_daemons_by_service('mgmt-gateway')] diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index 8b15aace373..4451e29878d 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -55,7 +55,7 @@ class NvmeofService(CephService): 'addr': addr, 'discovery_addr': discovery_addr, 'port': spec.port, - 'spdk_log_level': 'WARNING', + 'spdk_log_level': '', 'rpc_socket_dir': '/var/tmp/', 'rpc_socket_name': 'spdk.sock', 'transport_tcp_options': transport_tcp_options, @@ -123,10 +123,9 @@ class NvmeofService(CephService): gateways = json.loads(out)['gateways'] cmd_dicts = [] - spec = cast(NvmeofServiceSpec, - self.mgr.spec_store.all_specs.get(daemon_descrs[0].service_name(), None)) - for dd in daemon_descrs: + spec = cast(NvmeofServiceSpec, + self.mgr.spec_store.all_specs.get(dd.service_name(), None)) service_name = dd.service_name() if dd.hostname is None: err_msg = ('Trying to config_dashboard nvmeof but no hostname is defined') diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index 9b09b8c9f49..80bf92772c4 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -551,6 +551,12 @@ class RemoveUtil(object): "Zaps all devices that are associated with an OSD" if osd.hostname is not None: cmd = ['--', 'lvm', 'zap', '--osd-id', str(osd.osd_id)] + if osd.replace_block: + cmd.append('--replace-block') + if osd.replace_db: + cmd.append('--replace-db') + if osd.replace_wal: + cmd.append('--replace-wal') if not osd.no_destroy: cmd.append('--destroy') with self.mgr.async_timeout_handler(osd.hostname, f'cephadm ceph-volume {" ".join(cmd)}'): @@ -618,6 +624,9 @@ class OSD: started: bool = False, stopped: bool = False, replace: bool = False, + replace_block: bool = False, + replace_db: bool = False, + replace_wal: bool = False, force: bool = False, hostname: Optional[str] = None, zap: bool = False, @@ -649,6 +658,12 @@ class OSD: # If this is a replace or remove operation self.replace = replace + # If this is a block device replacement + self.replace_block = replace_block + # If this is a db device replacement + self.replace_db = replace_db + # If this is a wal device replacement + self.replace_wal = replace_wal # If we wait for the osd to be drained self.force = force # The name of the node @@ -676,7 +691,7 @@ class OSD: if self.stopped: logger.debug(f"Won't start draining {self}.
OSD draining is stopped.") return False - if self.replace: + if self.any_replace_params: self.rm_util.set_osd_flag([self], 'out') else: self.rm_util.reweight_osd(self, 0.0) @@ -686,7 +701,7 @@ class OSD: return True def stop_draining(self) -> bool: - if self.replace: + if self.any_replace_params: self.rm_util.set_osd_flag([self], 'in') else: if self.original_weight: @@ -764,6 +779,9 @@ class OSD: out['draining'] = self.draining out['stopped'] = self.stopped out['replace'] = self.replace + out['replace_block'] = self.replace_block + out['replace_db'] = self.replace_db + out['replace_wal'] = self.replace_wal out['force'] = self.force out['zap'] = self.zap out['hostname'] = self.hostname # type: ignore @@ -789,6 +807,13 @@ class OSD: inp['hostname'] = hostname return cls(**inp) + @property + def any_replace_params(self) -> bool: + return any([self.replace, + self.replace_block, + self.replace_db, + self.replace_wal]) + def __hash__(self) -> int: return hash(self.osd_id) @@ -812,7 +837,7 @@ class OSDRemovalQueue(object): # network calls, like mon commands. self.lock = Lock() - def process_removal_queue(self) -> None: + def process_removal_queue(self) -> bool: """ Performs actions in the _serve() loop to remove an OSD when criteria is met. @@ -820,6 +845,8 @@ class OSDRemovalQueue(object): we can't hold self.lock, as we're calling _remove_daemon in the loop """ + result: bool = False + # make sure that we don't run on OSDs that are not in the cluster anymore. self.cleanup() @@ -863,16 +890,23 @@ class OSDRemovalQueue(object): if self.mgr.cache.has_daemon(f'osd.{osd.osd_id}'): CephadmServe(self.mgr)._remove_daemon(f'osd.{osd.osd_id}', osd.hostname) logger.info(f"Successfully removed {osd} on {osd.hostname}") + result = True else: logger.info(f"Daemon {osd} on {osd.hostname} was already removed") - if osd.replace: + any_replace_params: bool = any([osd.replace, + osd.replace_block, + osd.replace_db, + osd.replace_wal]) + if any_replace_params: # mark destroyed in osdmap if not osd.destroy(): raise orchestrator.OrchestratorError( f"Could not destroy {osd}") logger.info( f"Successfully destroyed old {osd} on {osd.hostname}; ready for replacement") + if any_replace_params: + osd.zap = True else: # purge from osdmap if not osd.purge(): @@ -884,7 +918,7 @@ class OSDRemovalQueue(object): logger.info(f"Zapping devices for {osd} on {osd.hostname}") osd.do_zap() logger.info(f"Successfully zapped devices for {osd} on {osd.hostname}") - + self.mgr.cache.invalidate_host_devices(osd.hostname) logger.debug(f"Removing {osd} from the queue.") # self could change while this is processing (osds get added from the CLI) @@ -893,6 +927,7 @@ class OSDRemovalQueue(object): with self.lock: self.osds.intersection_update(new_queue) self._save_to_store() + return result def cleanup(self) -> None: # OSDs can always be cleaned up manually. 
This ensures that we run on existing OSDs diff --git a/src/pybind/mgr/cephadm/services/smb.py b/src/pybind/mgr/cephadm/services/smb.py index dabc202a024..e322acb0e3e 100644 --- a/src/pybind/mgr/cephadm/services/smb.py +++ b/src/pybind/mgr/cephadm/services/smb.py @@ -1,6 +1,9 @@ +import errno import logging from typing import Any, Dict, List, Tuple, cast, Optional +from mgr_module import HandleCommandResult + from ceph.deployment.service_spec import ServiceSpec, SMBSpec from orchestrator import DaemonDescription @@ -117,6 +120,23 @@ class SMBService(CephService): return True return False + def ok_to_stop( + self, daemon_ids: List[str], force: bool = False, known: Optional[List[str]] = None + ) -> HandleCommandResult: + # if only 1 smb, alert user (this is not passable with --force) + warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, "SMB", 1, True) + if warn: + return HandleCommandResult(-errno.EBUSY, "", warn_message) + + # if reached here, there is > 1 smb daemon. + if force: + return HandleCommandResult(0, warn_message, "") + + # if reached here, > 1 smb daemon and no force flag. + # Provide warning + warn_message = "WARNING: Removing SMB daemons can cause clients to lose connectivity. " + return HandleCommandResult(-errno.EBUSY, "", warn_message) + def _allow_config_key_command(self, name: str) -> str: # permit the samba container config access to the mon config key store # with keys like smb/config/<cluster_id>/*. diff --git a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 index 972ef22e7b5..c767baddbb7 100644 --- a/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 +++ b/src/pybind/mgr/cephadm/templates/services/grafana/grafana.ini.j2 @@ -15,7 +15,8 @@ http_port = {{ http_port }} http_addr = {{ http_addr }} {% if mgmt_gw_enabled %} - root_url = %(protocol)s://%(domain)s/grafana/ + root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ + serve_from_sub_path = true {% endif %} [snapshots] external_enabled = false @@ -38,7 +39,7 @@ header_property = username auto_sign_up = true sync_ttl = 15 - whitelist = {{ mgmt_gw_ip }} + whitelist = {{ mgmt_gw_ips }} headers_encoded = false enable_login_token = false headers = Role:X-WEBAUTH-ROLE diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 index 260e7418e2d..50a61f843d1 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/external_server.conf.j2 @@ -46,9 +46,15 @@ server { # add_header Content-Security-Policy "default-src 'self'; script-src 'self'; object-src 'none'; base-uri 'none'; require-trusted-types-for 'script'; frame-ancestors 'self';"; {% endif %} -{% if oauth2_proxy_url %} +{% if spec.enable_health_check_endpoint or spec.virtual_ip %} + location /health { + return 200 'OK'; + add_header Content-Type text/plain; + } +{% endif %} +{% if enable_oauth2_proxy %} location /oauth2/ { - proxy_pass {{ oauth2_proxy_url }}; + proxy_pass https://oauth2_proxy_servers; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Scheme $scheme; @@ -58,7 +64,7 @@ server { location = /oauth2/auth { internal; - proxy_pass {{ oauth2_proxy_url }}; + proxy_pass https://oauth2_proxy_servers; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Scheme 
$scheme; @@ -72,7 +78,7 @@ server { location / { proxy_pass {{ dashboard_scheme }}://dashboard_servers; proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504; - {% if oauth2_proxy_url %} + {% if enable_oauth2_proxy %} auth_request /oauth2/auth; error_page 401 = /oauth2/sign_in; @@ -109,12 +115,12 @@ server { {% if grafana_endpoints %} location /grafana { - rewrite ^/grafana/(.*) /$1 break; proxy_pass {{ grafana_scheme }}://grafana_servers; # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services proxy_set_header Authorization ""; - {% if oauth2_proxy_url %} + proxy_buffering off; + {% if enable_oauth2_proxy %} auth_request /oauth2/auth; error_page 401 = /oauth2/sign_in; @@ -150,7 +156,7 @@ server { proxy_ssl_trusted_certificate /etc/nginx/ssl/ca.crt; proxy_ssl_verify on; proxy_ssl_verify_depth 2; - {% if oauth2_proxy_url %} + {% if enable_oauth2_proxy %} auth_request /oauth2/auth; error_page 401 = /oauth2/sign_in; @@ -174,7 +180,7 @@ server { proxy_ssl_trusted_certificate /etc/nginx/ssl/ca.crt; proxy_ssl_verify on; proxy_ssl_verify_depth 2; - {% if oauth2_proxy_url %} + {% if enable_oauth2_proxy %} auth_request /oauth2/auth; error_page 401 = /oauth2/sign_in; diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 index f2c32f87977..2abb24b2eba 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/internal_server.conf.j2 @@ -1,5 +1,8 @@ server { + ssl_client_certificate /etc/nginx/ssl/ca.crt; + ssl_verify_client on; + listen {{ internal_port }} ssl; listen [::]:{{ internal_port }} ssl; ssl_certificate /etc/nginx/ssl/nginx_internal.crt; @@ -9,6 +12,20 @@ server { ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-RSA-CHACHA20-POLY1305; ssl_prefer_server_ciphers on; +{% if spec.enable_health_check_endpoint or spec.virtual_ip %} + location /health { + return 200 'OK'; + add_header Content-Type text/plain; + } +{% endif %} +{% if service_discovery_endpoints %} + location /internal/sd { + rewrite ^/internal/(.*) /$1 break; + proxy_pass https://service_discovery_servers; + proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504; + } +{% endif %} + {% if dashboard_endpoints %} location /internal/dashboard { rewrite ^/internal/dashboard/(.*) /$1 break; diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 index 0c2a6b98c3b..b9773ceeeb3 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 @@ -8,6 +8,7 @@ events { http { + #access_log /dev/stdout; client_header_buffer_size 32K; large_client_header_buffers 4 32k; proxy_busy_buffers_size 512k; @@ -16,6 +17,22 @@ http { proxy_headers_hash_max_size 1024; proxy_headers_hash_bucket_size 128; +{% if oauth2_proxy_endpoints %} + upstream oauth2_proxy_servers { + {% for ep in oauth2_proxy_endpoints %} + server {{ ep }}; + {% endfor %} + } +{% endif %} + +{% if 
service_discovery_endpoints %} + upstream service_discovery_servers { + {% for ep in service_discovery_endpoints %} + server {{ ep }}; + {% endfor %} + } +{% endif %} + {% if dashboard_endpoints %} upstream dashboard_servers { {% for ep in dashboard_endpoints %} diff --git a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 index ded403169c9..03ff8a32ca2 100644 --- a/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nfs/ganesha.conf.j2 @@ -4,6 +4,7 @@ NFS_CORE_PARAM { Enable_RQUOTA = false; Protocols = 4; NFS_Port = {{ port }}; + allow_set_io_flusher_fail = true; {% if bind_addr %} Bind_addr = {{ bind_addr }}; {% endif %} diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 index dbe29004771..760bc97e515 100644 --- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 @@ -20,6 +20,8 @@ allowed_consecutive_spdk_ping_failures = {{ spec.allowed_consecutive_spdk_ping_f spdk_ping_interval_in_seconds = {{ spec.spdk_ping_interval_in_seconds }} ping_spdk_under_lock = {{ spec.ping_spdk_under_lock }} enable_monitor_client = {{ spec.enable_monitor_client }} +max_hosts_per_namespace = {{ spec.max_hosts_per_namespace }} +max_namespaces_with_netmask = {{ spec.max_namespaces_with_netmask }} [gateway-logs] log_level = {{ spec.log_level }} @@ -53,7 +55,15 @@ rpc_socket_dir = {{ spec.rpc_socket_dir }} rpc_socket_name = {{ spec.rpc_socket_name }} timeout = {{ spec.spdk_timeout }} bdevs_per_cluster = {{ spec.bdevs_per_cluster }} +{% if spec.spdk_log_level %} log_level = {{ spec.spdk_log_level }} +{% endif %} +{% if spec.spdk_protocol_log_level %} +protocol_log_level = {{ spec.spdk_protocol_log_level }} +{% endif %} +{% if spec.spdk_log_file_dir %} +log_file_dir = {{ spec.spdk_log_file_dir }} +{% endif %} conn_retries = {{ spec.conn_retries }} transports = {{ spec.transports }} {% if transport_tcp_options %} @@ -65,4 +75,7 @@ tgt_cmd_extra_args = {{ spec.tgt_cmd_extra_args }} [monitor] timeout = {{ spec.monitor_timeout }} +{% if spec.monitor_client_log_file_dir %} +log_file_dir = {{ spec.monitor_client_log_file_dir }} +{% endif %} diff --git a/src/pybind/mgr/cephadm/tests/ceph_volume_data.py b/src/pybind/mgr/cephadm/tests/ceph_volume_data.py new file mode 100644 index 00000000000..afd6d89d39e --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/ceph_volume_data.py @@ -0,0 +1 @@ +data = 
'{"0":[{"devices":["/dev/vdb"],"lv_name":"osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","lv_path":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.block_uuid=d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c,ceph.db_uuid=EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX,ceph.encrypted=0,ceph.osd_fsid=8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.osd_id=0,ceph.osdspec_affinity=osd.shared_db,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","name":"osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","path":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","tags":{"ceph.block_device":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.block_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","ceph.db_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","ceph.encrypted":"0","ceph.osd_fsid":"8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.osd_id":"0","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668"},{"devices":["/dev/vdk"],"lv_name":"osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","lv_path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.block_uuid=d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c,ceph.db_uuid=EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX,ceph.encrypted=0,ceph.osd_fsid=8cd7fa43-ef40-49e7-abb2-db5cfd91bc92,ceph.osd_id=0,ceph.osdspec_affinity=osd.shared_db,ceph.type=db,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","name":"osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","tags":{"ceph.block_device":"/dev/ceph-81c76363-7a89-47d2-83c1-fdcbab5d6668/osd-block-8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.block_uuid":"d518Lz-gTnC-FyX7-4MN2-icIp-LBCB-zdQw2p","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-f0f5e20c-f1ee-42df-9a78-0e70b9c08e6c","ceph.db_uuid":"EInXUQ-LDDO-7jCL-Y0Jb-tPZ2-KuKl-VNJ2hX","ceph.encrypted":"0","ceph.osd_fsid":"8cd7fa43-ef40-49e7-abb2-db5cfd91bc92","ceph.osd_id":"0","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"db","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"db","vg_name":
"ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf"}],"1":[{"devices":["/dev/vdc"],"lv_name":"osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","lv_path":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.block_uuid=Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774,ceph.db_uuid=1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC,ceph.encrypted=0,ceph.osd_fsid=aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.osd_id=1,ceph.osdspec_affinity=osd.shared_db,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","name":"osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","path":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","tags":{"ceph.block_device":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.block_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","ceph.db_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","ceph.encrypted":"0","ceph.osd_fsid":"aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.osd_id":"1","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb"},{"devices":["/dev/vdk"],"lv_name":"osd-db-38f53373-7575-4c90-98ca-28f189685774","lv_path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.block_uuid=Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774,ceph.db_uuid=1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC,ceph.encrypted=0,ceph.osd_fsid=aaa4c8cb-2b54-4df8-9846-17063c59b6ce,ceph.osd_id=1,ceph.osdspec_affinity=osd.shared_db,ceph.type=db,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","name":"osd-db-38f53373-7575-4c90-98ca-28f189685774","path":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","tags":{"ceph.block_device":"/dev/ceph-964cfc71-ad91-4189-97c1-cab4fd3066bb/osd-block-aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.block_uuid":"Ccvedr-7t3C-BgIg-lfSl-qW3J-Zw1V-FuH14l","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf/osd-db-38f53373-7575-4c90-98ca-28f189685774","ceph.db_uuid":"1mEAHd-mxQn-Qr9c-DkD8-XGOQ-xfIN-ZsPReC","ceph.encrypted":"0","ceph.osd_fsid":"aaa4c8cb-2b54-4df8-9846-17063c59b6ce","ceph.osd_id":"1","ceph.osdspec_affinity":"osd.shared_db","ceph.type":"db","ceph.vdo":"0"
,"ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-e10d6a69-68ec-44ba-bd3b-9a20d15cacbf"}],"2":[{"devices":["/dev/vdf"],"lv_name":"osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","lv_path":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33,ceph.block_uuid=adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=a0434b49-759a-46a4-91dc-d7cc65af3a33,ceph.osd_id=2,ceph.osdspec_affinity=None,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO","name":"osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","path":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","tags":{"ceph.block_device":"/dev/ceph-3ba7a728-709b-408c-a043-9e48704b5ffb/osd-block-a0434b49-759a-46a4-91dc-d7cc65af3a33","ceph.block_uuid":"adQsil-KScK-5QkX-bLbg-EpJa-sNJL-3oDtaO","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"a0434b49-759a-46a4-91dc-d7cc65af3a33","ceph.osd_id":"2","ceph.osdspec_affinity":"None","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-3ba7a728-709b-408c-a043-9e48704b5ffb"}],"3":[{"devices":["/dev/vde"],"lv_name":"osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","lv_path":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f,ceph.block_uuid=GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=861ea81a-c24b-4c69-b4f6-e527151b132f,ceph.osd_id=3,ceph.osdspec_affinity=None,ceph.type=block,ceph.vdo=0,ceph.with_tpm=0","lv_uuid":"GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8","name":"osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","path":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","tags":{"ceph.block_device":"/dev/ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b/osd-block-861ea81a-c24b-4c69-b4f6-e527151b132f","ceph.block_uuid":"GBfm14-4hPu-oaWk-wSdA-O1Fw-eU5o-Q2KOh8","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"861ea81a-c24b-4c69-b4f6-e527151b132f","ceph.osd_id":"3","ceph.osdspec_affinity":"None","ceph.type":"block","ceph.vdo":"0","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-97ac74d9-d351-4a7e-bbd1-27b8dd3e7f7b"}],"4":[{"devices":["/dev/vdg"],"lv_name":"osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","lv_path":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_cla
ss=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16,ceph.db_uuid=5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=block,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","name":"osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","path":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","ceph.db_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"block","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-20acdce8-5548-4707-a38e-b8e925485bc5"},{"devices":["/dev/vdj"],"lv_name":"osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","lv_path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=wal,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","name":"osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"wal","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"wal","vg_name":"ceph-776f980b-152a-4e8f-99b6-bae27ed0b528"},{"devices":["/dev/vdi"],"lv_name":"osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","lv_path":"/dev/ceph-8da158be-4d0d-41bd-8
6ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.block_uuid=diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16,ceph.db_uuid=5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx,ceph.encrypted=0,ceph.osd_fsid=242c4a21-b076-424c-94fb-3f556ed2ddbd,ceph.osd_id=4,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=db,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008,ceph.wal_uuid=ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2,ceph.with_tpm=0","lv_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","name":"osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","tags":{"ceph.block_device":"/dev/ceph-20acdce8-5548-4707-a38e-b8e925485bc5/osd-block-242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.block_uuid":"diO6OQ-jjkD-tdVS-FJ5f-VcP7-8QEW-geP4Ds","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-19fc3a21-ce53-4881-9217-f1d58166af16","ceph.db_uuid":"5mng9E-Q3ej-37eY-Ny9C-p6wf-h17w-gC3jtx","ceph.encrypted":"0","ceph.osd_fsid":"242c4a21-b076-424c-94fb-3f556ed2ddbd","ceph.osd_id":"4","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"db","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-2542dafe-2ff7-4e8b-bc70-a0297b421008","ceph.wal_uuid":"ppb82k-9cEs-yb1K-QTNl-c4BM-33PQ-bNX0c2","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452"}],"5":[{"devices":["/dev/vdj"],"lv_name":"osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","lv_path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=wal,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","name":"osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","path":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"wal","ceph.vdo":"
0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"wal","vg_name":"ceph-776f980b-152a-4e8f-99b6-bae27ed0b528"},{"devices":["/dev/vdh"],"lv_name":"osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","lv_path":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","lv_size":"214744170496","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb,ceph.db_uuid=wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=block,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","name":"osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","path":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","ceph.db_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"block","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"block","vg_name":"ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351"},{"devices":["/dev/vdi"],"lv_name":"osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","lv_path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","lv_size":"107369988096","lv_tags":"ceph.block_device=/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.block_uuid=gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E,ceph.cephx_lockbox_secret=,ceph.cluster_fsid=83231340-7cd4-11ef-ab48-525400e54507,ceph.cluster_name=ceph,ceph.crush_device_class=,ceph.db_device=/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb,ceph.db_uuid=wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2,ceph.encrypted=0,ceph.osd_fsid=8cf28853-3453-49b0-a3f9-a693443ed75f,ceph.osd_id=5,ceph.osdspec_affinity=osd.shared_db_wal,ceph.type=db,ceph.vdo=0,ceph.wal_device=/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea,ceph.wal_uuid=DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz,ceph.with_tpm=0","lv_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","name":"osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","path":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75db
fc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","tags":{"ceph.block_device":"/dev/ceph-84a4ccfc-80f1-4784-9558-a9a08b15a351/osd-block-8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.block_uuid":"gmQkh2-T5i3-Kwfa-YMMO-j88X-RvDw-dx7N6E","ceph.cephx_lockbox_secret":"","ceph.cluster_fsid":"83231340-7cd4-11ef-ab48-525400e54507","ceph.cluster_name":"ceph","ceph.crush_device_class":"","ceph.db_device":"/dev/ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452/osd-db-635f592b-1d4f-4117-aaa6-b68878f84dfb","ceph.db_uuid":"wf407q-HwuD-OWhh-xm2A-d2sv-Fdsx-JqeUj2","ceph.encrypted":"0","ceph.osd_fsid":"8cf28853-3453-49b0-a3f9-a693443ed75f","ceph.osd_id":"5","ceph.osdspec_affinity":"osd.shared_db_wal","ceph.type":"db","ceph.vdo":"0","ceph.wal_device":"/dev/ceph-776f980b-152a-4e8f-99b6-bae27ed0b528/osd-wal-90739e2d-ec18-4761-8290-1ad508ecbeea","ceph.wal_uuid":"DFQDJy-6bE0-iagr-hgmh-oUEH-HF2R-ILBzzz","ceph.with_tpm":"0"},"type":"db","vg_name":"ceph-8da158be-4d0d-41bd-86ef-d75dbfc71452"}]}' diff --git a/src/pybind/mgr/cephadm/tests/conftest.py b/src/pybind/mgr/cephadm/tests/conftest.py index e8add2c7b83..5cc2fabaf49 100644 --- a/src/pybind/mgr/cephadm/tests/conftest.py +++ b/src/pybind/mgr/cephadm/tests/conftest.py @@ -1,13 +1,14 @@ import pytest from cephadm.services.osd import RemoveUtil, OSD -from tests import mock - +from mock import mock from .fixtures import with_cephadm_module +from cephadm import CephadmOrchestrator +from typing import Generator @pytest.fixture() -def cephadm_module(): +def cephadm_module() -> Generator[CephadmOrchestrator, None, None]: with with_cephadm_module({}) as m: yield m diff --git a/src/pybind/mgr/cephadm/tests/fixtures.py b/src/pybind/mgr/cephadm/tests/fixtures.py index dd858c6c7da..dda0c6720ac 100644 --- a/src/pybind/mgr/cephadm/tests/fixtures.py +++ b/src/pybind/mgr/cephadm/tests/fixtures.py @@ -35,11 +35,11 @@ def get_module_option_ex(_, module, key, default=None): return None -def _run_cephadm(ret): +def _run_cephadm(ret, rc: int = 0): async def foo(s, host, entity, cmd, e, **kwargs): if cmd == 'gather-facts': return '{}', '', 0 - return [ret], '', 0 + return [ret], '', rc return foo diff --git a/src/pybind/mgr/cephadm/tests/test_ceph_volume.py b/src/pybind/mgr/cephadm/tests/test_ceph_volume.py new file mode 100644 index 00000000000..cc1378a7575 --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/test_ceph_volume.py @@ -0,0 +1,231 @@ +import json +import pytest +from .ceph_volume_data import data +from cephadm.serve import CephadmServe +from cephadm import CephadmOrchestrator +from mock import patch +from .fixtures import _run_cephadm, with_host + + +class TestCephVolume: + def test_run(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('fake-output', 0)): + c = cephadm_module.ceph_volume.run('test', ['/bin/foo']) + assert c == (['fake-output'], '', 0) + + def test_run_json(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('{"this-is-a-fake-key": "this-is-a-fake-value"}', 0)): + c = cephadm_module.ceph_volume.run_json('test', ['/bin/foo']) + assert c == {"this-is-a-fake-key": 
"this-is-a-fake-value"} + + def test_clear_replace_header_ok(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('fake-output', 0)): + c = cephadm_module.ceph_volume.clear_replace_header('test', '/dev/foo') + assert c == 'Replacement header cleared on /dev/foo' + + def test_clear_replace_header_nok(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm('', 1)): + c = cephadm_module.ceph_volume.clear_replace_header('fake-output', '/dev/foo') + assert c.strip() == 'No replacement header could be cleared on /dev/foo.' + + +class TestCephVolumeList: + def test_get_data(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.data == json.loads(data) + + def test_devices_by_type_block(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.devices_by_type('block')) == set(['/dev/vdb', + '/dev/vdc', + '/dev/vdg', + '/dev/vde', + '/dev/vdf', + '/dev/vdh']) + + def test_devices_by_type_db(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.devices_by_type('db')) == set(['/dev/vdi', + '/dev/vdk']) + + def test_devices_by_type_wal(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.devices_by_type('wal') == ['/dev/vdj'] + + def test_block_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.block_devices()) == set(['/dev/vdb', + '/dev/vdc', + '/dev/vdg', 
+ '/dev/vde', + '/dev/vdf', + '/dev/vdh']) + + def test_db_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.db_devices()) == set(['/dev/vdk', + '/dev/vdi']) + + def test_wal_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.wal_devices()) == set(['/dev/vdj']) + + def test_all_devices(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.all_devices()) == set(['/dev/vdg', + '/dev/vdj', + '/dev/vdh', + '/dev/vdi', + '/dev/vdc', + '/dev/vde', + '/dev/vdf', + '/dev/vdb', + '/dev/vdk']) + + def test_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.device_osd_mapping() == {'/dev/vdb': {'osd_ids': ['0']}, + '/dev/vdk': {'osd_ids': ['0', '1']}, + '/dev/vdc': {'osd_ids': ['1']}, + '/dev/vdf': {'osd_ids': ['2']}, + '/dev/vde': {'osd_ids': ['3']}, + '/dev/vdg': {'osd_ids': ['4']}, + '/dev/vdj': {'osd_ids': ['4', '5']}, + '/dev/vdi': {'osd_ids': ['4', '5']}, + '/dev/vdh': {'osd_ids': ['5']}} + + def test_block_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.block_device_osd_mapping() == {'/dev/vdb': {'osd_ids': ['0']}, + '/dev/vdc': {'osd_ids': ['1']}, + '/dev/vdf': {'osd_ids': ['2']}, + '/dev/vde': {'osd_ids': ['3']}, + '/dev/vdg': {'osd_ids': ['4']}, + '/dev/vdh': {'osd_ids': ['5']}} + + def test_db_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.db_device_osd_mapping() == {'/dev/vdk': {'osd_ids': 
['0', '1']}, + '/dev/vdi': {'osd_ids': ['4', '5']}} + + def test_wal_device_osd_mapping(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.wal_device_osd_mapping() == {'/dev/vdj': {'osd_ids': ['4', '5']}} + + def test_is_shared_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_shared_device('/dev/vdj') + + def test_is_shared_device_with_invalid_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + with pytest.raises(RuntimeError) as e: + assert cephadm_module.ceph_volume.lvm_list.is_shared_device('/dev/invalid-device') + assert str(e.value) == 'Not a valid device path.' + + def test_is_block_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_block_device('/dev/vdb') + + def test_is_db_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_db_device('/dev/vdk') + + def test_is_wal_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.is_wal_device('/dev/vdj') + + def test_get_block_devices_from_osd_id(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert cephadm_module.ceph_volume.lvm_list.get_block_devices_from_osd_id('0') == ['/dev/vdb'] + + def test_osd_ids(self, 
cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + cephadm_module.ceph_volume.lvm_list.get_data('test') + assert set(cephadm_module.ceph_volume.lvm_list.osd_ids()) == set(['0', '1', '2', '3', '4', '5']) diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 5a485f98be3..975c125225d 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -2040,7 +2040,7 @@ class TestCephadm(object): ), CephadmOrchestrator.apply_iscsi), (CustomContainerSpec( service_id='hello-world', - image='docker.io/library/hello-world:latest', + image='quay.io/hello-world/hello-world:latest', uid=65534, gid=65534, dirs=['foo/bar'], diff --git a/src/pybind/mgr/cephadm/tests/test_replace_device.py b/src/pybind/mgr/cephadm/tests/test_replace_device.py new file mode 100644 index 00000000000..b4a2c81ad9a --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/test_replace_device.py @@ -0,0 +1,53 @@ +import pytest +from mock import patch +from .fixtures import _run_cephadm, with_host, wait +from .ceph_volume_data import data +from cephadm.serve import CephadmServe +from cephadm import CephadmOrchestrator +from orchestrator import OrchestratorError + + +class TestReplaceDevice: + def test_invalid_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + with pytest.raises(OrchestratorError) as e: + cephadm_module.replace_device('test', '/dev/invalid-device') + assert "/dev/invalid-device doesn't appear to be used for an OSD, not a valid device in test." in str(e.value) + + def test_invalid_hostname(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + with pytest.raises(OrchestratorError): + cephadm_module.replace_device('invalid-hostname', '/dev/vdb') + + def test_block_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + c = cephadm_module.replace_device('test', '/dev/vdb') + result = wait(cephadm_module, c) + assert result == "Scheduled to destroy osds: ['0'] and mark /dev/vdb as being replaced." 
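# --- Editor's illustrative sketch, not part of the patch above. ---
# The new tests all repeat the same setup: patch CephadmServe._run_cephadm with the
# canned `ceph-volume lvm list` JSON (`data` from ceph_volume_data), refresh the host,
# then query the parsed inventory via cephadm_module.ceph_volume.lvm_list. A hypothetical
# helper that factors out that boilerplate could look like the following; the name
# `loaded_lvm_list` is an assumption for illustration, it does not exist in the patch.
from contextlib import contextmanager
from mock import patch
from .ceph_volume_data import data
from .fixtures import _run_cephadm, with_host
from cephadm.serve import CephadmServe


@contextmanager
def loaded_lvm_list(cephadm_module, host='test'):
    # Same pattern as the tests above: empty daemon list first, then the canned
    # ceph-volume output, then load it into the lvm_list helper for querying.
    with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')):
        with with_host(cephadm_module, host):
            CephadmServe(cephadm_module)._refresh_host_daemons(host)
            with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm(data)):
                cephadm_module.ceph_volume.lvm_list.get_data(host)
                yield cephadm_module.ceph_volume.lvm_list


# Example use, mirroring test_is_shared_device and test_osd_ids above:
#     with loaded_lvm_list(cephadm_module) as lvm:
#         assert lvm.is_shared_device('/dev/vdj')
#         assert set(lvm.osd_ids()) == {'0', '1', '2', '3', '4', '5'}
# --- End of editor's sketch; the patch continues below. ---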
+ + def test_shared_db_device_no_ireallymeanit_flag(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + with pytest.raises(OrchestratorError) as e: + cephadm_module.replace_device('test', '/dev/vdk') + assert "/dev/vdk is a shared device.\nReplacing /dev/vdk implies destroying OSD(s): ['0', '1'].\nPlease, *be very careful*, this can be a very dangerous operation.\nIf you know what you are doing, pass --yes-i-really-mean-it" in str(e.value) + + def test_shared_db_device(self, cephadm_module: CephadmOrchestrator) -> None: + with patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')): + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._refresh_host_daemons('test') + with patch('cephadm.serve.CephadmServe._run_cephadm', _run_cephadm(data)): + c = cephadm_module.replace_device('test', '/dev/vdk', yes_i_really_mean_it=True) + result = wait(cephadm_module, c) + assert result == "Scheduled to destroy osds: ['0', '1'] and mark /dev/vdk as being replaced." diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index a9b7da624a0..84d7c8f5b13 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -49,9 +49,9 @@ from typing import Dict, List cephadm_root_ca = """-----BEGIN CERTIFICATE-----\\nMIIE7DCCAtSgAwIBAgIUE8b2zZ64geu2ns3Zfn3/4L+Cf6MwDQYJKoZIhvcNAQEL\\nBQAwFzEVMBMGA1UEAwwMY2VwaGFkbS1yb290MB4XDTI0MDYyNjE0NDA1M1oXDTM0\\nMDYyNzE0NDA1M1owFzEVMBMGA1UEAwwMY2VwaGFkbS1yb290MIICIjANBgkqhkiG\\n9w0BAQEFAAOCAg8AMIICCgKCAgEAsZRJsdtTr9GLG1lWFql5SGc46ldFanNJd1Gl\\nqXq5vgZVKRDTmNgAb/XFuNEEmbDAXYIRZolZeYKMHfn0pouPRSel0OsC6/02ZUOW\\nIuN89Wgo3IYleCFpkVIumD8URP3hwdu85plRxYZTtlruBaTRH38lssyCqxaOdEt7\\nAUhvYhcMPJThB17eOSQ73mb8JEC83vB47fosI7IhZuvXvRSuZwUW30rJanWNhyZq\\neS2B8qw2RSO0+77H6gA4ftBnitfsE1Y8/F9Z/f92JOZuSMQXUB07msznPbRJia3f\\nueO8gOc32vxd1A1/Qzp14uX34yEGY9ko2lW226cZO29IVUtXOX+LueQttwtdlpz8\\ne6Npm09pXhXAHxV/OW3M28MdXmobIqT/m9MfkeAErt5guUeC5y8doz6/3VQRjFEn\\nRpN0WkblgnNAQ3DONPc+Qd9Fi/wZV2X7bXoYpNdoWDsEOiE/eLmhG1A2GqU/mneP\\nzQ6u79nbdwTYpwqHpa+PvusXeLfKauzI8lLUJotdXy9EK8iHUofibB61OljYye6B\\nG3b8C4QfGsw8cDb4APZd/6AZYyMx/V3cGZ+GcOV7WvsC8k7yx5Uqasm/kiGQ3EZo\\nuNenNEYoGYrjb8D/8QzqNUTwlEh27/ps80tO7l2GGTvWVZL0PRZbmLDvO77amtOf\\nOiRXMoUCAwEAAaMwMC4wGwYDVR0RBBQwEocQAAAAAAAAAAAAAAAAAAAAATAPBgNV\\nHRMBAf8EBTADAQH/MA0GCSqGSIb3DQEBCwUAA4ICAQAxwzX5AhYEWhTV4VUwUj5+\\nqPdl4Q2tIxRokqyE+cDxoSd+6JfGUefUbNyBxDt0HaBq8obDqqrbcytxnn7mpnDu\\nhtiauY+I4Amt7hqFOiFA4cCLi2mfok6g2vL53tvhd9IrsfflAU2wy7hL76Ejm5El\\nA+nXlkJwps01Whl9pBkUvIbOn3pXX50LT4hb5zN0PSu957rjd2xb4HdfuySm6nW4\\n4GxtVWfmGA6zbC4XMEwvkuhZ7kD2qjkAguGDF01uMglkrkCJT3OROlNBuSTSBGqt\\ntntp5VytHvb7KTF7GttM3ha8/EU2KYaHM6WImQQTrOfiImAktOk4B3lzUZX3HYIx\\n+sByO4P4dCvAoGz1nlWYB2AvCOGbKf0Tgrh4t4jkiF8FHTXGdfvWmjgi1pddCNAy\\nn65WOCmVmLZPERAHOk1oBwqyReSvgoCFo8FxbZcNxJdlhM0Z6hzKggm3O3Dl88Xl\\n5euqJjh2STkBW8Xuowkg1TOs5XyWvKoDFAUzyzeLOL8YSG+gXV22gPTUaPSVAqdb\\nwd0Fx2kjConuC5bgTzQHs8XWA930U3XWZraj21Vaa8UxlBLH4fUro8H5lMSYlZNE\\nJHRNW8BkznAClaFSDG3dybLsrzrBFAu/Qb5zVkT1xyq0YkepGB7leXwq6vjWA5Pw\\nmZbKSphWfh0qipoqxqhfkw==\\n-----END CERTIFICATE-----\\n""" -ceph_generated_cert = """-----BEGIN 
CERTIFICATE-----\nMIICxjCCAa4CEQDIZSujNBlKaLJzmvntjukjMA0GCSqGSIb3DQEBDQUAMCExDTAL\nBgNVBAoMBENlcGgxEDAOBgNVBAMMB2NlcGhhZG0wHhcNMjIwNzEzMTE0NzA3WhcN\nMzIwNzEwMTE0NzA3WjAhMQ0wCwYDVQQKDARDZXBoMRAwDgYDVQQDDAdjZXBoYWRt\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyyMe4DMA+MeYK7BHZMHB\nq7zjliEOcNgxomjU8qbf5USF7Mqrf6+/87XWqj4pCyAW8x0WXEr6A56a+cmBVmt+\nqtWDzl020aoId6lL5EgLLn6/kMDCCJLq++Lg9cEofMSvcZh+lY2f+1p+C+00xent\nrLXvXGOilAZWaQfojT2BpRnNWWIFbpFwlcKrlg2G0cFjV5c1m6a0wpsQ9JHOieq0\nSvwCixajwq3CwAYuuiU1wjI4oJO4Io1+g8yB3nH2Mo/25SApCxMXuXh4kHLQr/T4\n4hqisvG4uJYgKMcSIrWj5o25mclByGi1UI/kZkCUES94i7Z/3ihx4Bad0AMs/9tw\nFwIDAQABMA0GCSqGSIb3DQEBDQUAA4IBAQAf+pwz7Gd7mDwU2LY0TQXsK6/8KGzh\nHuX+ErOb8h5cOAbvCnHjyJFWf6gCITG98k9nxU9NToG0WYuNm/max1y/54f0dtxZ\npUo6KSNl3w6iYCfGOeUIj8isi06xMmeTgMNzv8DYhDt+P2igN6LenqWTVztogkiV\nxQ5ZJFFLEw4sN0CXnrZX3t5ruakxLXLTLKeE0I91YJvjClSBGkVJq26wOKQNHMhx\npWxeydQ5EgPZY+Aviz5Dnxe8aB7oSSovpXByzxURSabOuCK21awW5WJCGNpmqhWK\nZzACBDEstccj57c4OGV0eayHJRsluVr2e9NHRINZA3qdB37e6gsI1xHo\n-----END CERTIFICATE-----\n""" +ceph_generated_cert = """-----BEGIN CERTIFICATE-----\\nMIICxjCCAa4CEQDIZSujNBlKaLJzmvntjukjMA0GCSqGSIb3DQEBDQUAMCExDTAL\\nBgNVBAoMBENlcGgxEDAOBgNVBAMMB2NlcGhhZG0wHhcNMjIwNzEzMTE0NzA3WhcN\\nMzIwNzEwMTE0NzA3WjAhMQ0wCwYDVQQKDARDZXBoMRAwDgYDVQQDDAdjZXBoYWRt\\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAyyMe4DMA+MeYK7BHZMHB\\nq7zjliEOcNgxomjU8qbf5USF7Mqrf6+/87XWqj4pCyAW8x0WXEr6A56a+cmBVmt+\\nqtWDzl020aoId6lL5EgLLn6/kMDCCJLq++Lg9cEofMSvcZh+lY2f+1p+C+00xent\\nrLXvXGOilAZWaQfojT2BpRnNWWIFbpFwlcKrlg2G0cFjV5c1m6a0wpsQ9JHOieq0\\nSvwCixajwq3CwAYuuiU1wjI4oJO4Io1+g8yB3nH2Mo/25SApCxMXuXh4kHLQr/T4\\n4hqisvG4uJYgKMcSIrWj5o25mclByGi1UI/kZkCUES94i7Z/3ihx4Bad0AMs/9tw\\nFwIDAQABMA0GCSqGSIb3DQEBDQUAA4IBAQAf+pwz7Gd7mDwU2LY0TQXsK6/8KGzh\\nHuX+ErOb8h5cOAbvCnHjyJFWf6gCITG98k9nxU9NToG0WYuNm/max1y/54f0dtxZ\\npUo6KSNl3w6iYCfGOeUIj8isi06xMmeTgMNzv8DYhDt+P2igN6LenqWTVztogkiV\\nxQ5ZJFFLEw4sN0CXnrZX3t5ruakxLXLTLKeE0I91YJvjClSBGkVJq26wOKQNHMhx\\npWxeydQ5EgPZY+Aviz5Dnxe8aB7oSSovpXByzxURSabOuCK21awW5WJCGNpmqhWK\\nZzACBDEstccj57c4OGV0eayHJRsluVr2e9NHRINZA3qdB37e6gsI1xHo\\n-----END CERTIFICATE-----\\n""" -ceph_generated_key = """-----BEGIN PRIVATE 
KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDLIx7gMwD4x5gr\nsEdkwcGrvOOWIQ5w2DGiaNTypt/lRIXsyqt/r7/ztdaqPikLIBbzHRZcSvoDnpr5\nyYFWa36q1YPOXTbRqgh3qUvkSAsufr+QwMIIkur74uD1wSh8xK9xmH6VjZ/7Wn4L\n7TTF6e2ste9cY6KUBlZpB+iNPYGlGc1ZYgVukXCVwquWDYbRwWNXlzWbprTCmxD0\nkc6J6rRK/AKLFqPCrcLABi66JTXCMjigk7gijX6DzIHecfYyj/blICkLExe5eHiQ\nctCv9PjiGqKy8bi4liAoxxIitaPmjbmZyUHIaLVQj+RmQJQRL3iLtn/eKHHgFp3Q\nAyz/23AXAgMBAAECggEAVoTB3Mm8azlPlaQB9GcV3tiXslSn+uYJ1duCf0sV52dV\nBzKW8s5fGiTjpiTNhGCJhchowqxoaew+o47wmGc2TvqbpeRLuecKrjScD0GkCYyQ\neM2wlshEbz4FhIZdgS6gbuh9WaM1dW/oaZoBNR5aTYo7xYTmNNeyLA/jO2zr7+4W\n5yES1lMSBXpKk7bDGKYY4bsX2b5RLr2Grh2u2bp7hoLABCEvuu8tSQdWXLEXWpXo\njwmV3hc6tabypIa0mj2Dmn2Dmt1ppSO0AZWG/WAizN3f4Z0r/u9HnbVrVmh0IEDw\n3uf2LP5o3msG9qKCbzv3lMgt9mMr70HOKnJ8ohMSKQKBgQDLkNb+0nr152HU9AeJ\nvdz8BeMxcwxCG77iwZphZ1HprmYKvvXgedqWtS6FRU+nV6UuQoPUbQxJBQzrN1Qv\nwKSlOAPCrTJgNgF/RbfxZTrIgCPuK2KM8I89VZv92TSGi362oQA4MazXC8RAWjoJ\nSu1/PHzK3aXOfVNSLrOWvIYeZQKBgQD/dgT6RUXKg0UhmXj7ExevV+c7oOJTDlMl\nvLngrmbjRgPO9VxLnZQGdyaBJeRngU/UXfNgajT/MU8B5fSKInnTMawv/tW7634B\nw3v6n5kNIMIjJmENRsXBVMllDTkT9S7ApV+VoGnXRccbTiDapBThSGd0wri/CuwK\nNWK1YFOeywKBgEDyI/XG114PBUJ43NLQVWm+wx5qszWAPqV/2S5MVXD1qC6zgCSv\nG9NLWN1CIMimCNg6dm7Wn73IM7fzvhNCJgVkWqbItTLG6DFf3/DPODLx1wTMqLOI\nqFqMLqmNm9l1Nec0dKp5BsjRQzq4zp1aX21hsfrTPmwjxeqJZdioqy2VAoGAXR5X\nCCdSHlSlUW8RE2xNOOQw7KJjfWT+WAYoN0c7R+MQplL31rRU7dpm1bLLRBN11vJ8\nMYvlT5RYuVdqQSP6BkrX+hLJNBvOLbRlL+EXOBrVyVxHCkDe+u7+DnC4epbn+N8P\nLYpwqkDMKB7diPVAizIKTBxinXjMu5fkKDs5n+sCgYBbZheYKk5M0sIxiDfZuXGB\nkf4mJdEkTI1KUGRdCwO/O7hXbroGoUVJTwqBLi1tKqLLarwCITje2T200BYOzj82\nqwRkCXGtXPKnxYEEUOiFx9OeDrzsZV00cxsEnX0Zdj+PucQ/J3Cvd0dWUspJfLHJ\n39gnaegswnz9KMQAvzKFdg==\n-----END PRIVATE KEY-----\n""" +ceph_generated_key = """-----BEGIN PRIVATE KEY-----\\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDLIx7gMwD4x5gr\\nsEdkwcGrvOOWIQ5w2DGiaNTypt/lRIXsyqt/r7/ztdaqPikLIBbzHRZcSvoDnpr5\\nyYFWa36q1YPOXTbRqgh3qUvkSAsufr+QwMIIkur74uD1wSh8xK9xmH6VjZ/7Wn4L\\n7TTF6e2ste9cY6KUBlZpB+iNPYGlGc1ZYgVukXCVwquWDYbRwWNXlzWbprTCmxD0\\nkc6J6rRK/AKLFqPCrcLABi66JTXCMjigk7gijX6DzIHecfYyj/blICkLExe5eHiQ\\nctCv9PjiGqKy8bi4liAoxxIitaPmjbmZyUHIaLVQj+RmQJQRL3iLtn/eKHHgFp3Q\\nAyz/23AXAgMBAAECggEAVoTB3Mm8azlPlaQB9GcV3tiXslSn+uYJ1duCf0sV52dV\\nBzKW8s5fGiTjpiTNhGCJhchowqxoaew+o47wmGc2TvqbpeRLuecKrjScD0GkCYyQ\\neM2wlshEbz4FhIZdgS6gbuh9WaM1dW/oaZoBNR5aTYo7xYTmNNeyLA/jO2zr7+4W\\n5yES1lMSBXpKk7bDGKYY4bsX2b5RLr2Grh2u2bp7hoLABCEvuu8tSQdWXLEXWpXo\\njwmV3hc6tabypIa0mj2Dmn2Dmt1ppSO0AZWG/WAizN3f4Z0r/u9HnbVrVmh0IEDw\\n3uf2LP5o3msG9qKCbzv3lMgt9mMr70HOKnJ8ohMSKQKBgQDLkNb+0nr152HU9AeJ\\nvdz8BeMxcwxCG77iwZphZ1HprmYKvvXgedqWtS6FRU+nV6UuQoPUbQxJBQzrN1Qv\\nwKSlOAPCrTJgNgF/RbfxZTrIgCPuK2KM8I89VZv92TSGi362oQA4MazXC8RAWjoJ\\nSu1/PHzK3aXOfVNSLrOWvIYeZQKBgQD/dgT6RUXKg0UhmXj7ExevV+c7oOJTDlMl\\nvLngrmbjRgPO9VxLnZQGdyaBJeRngU/UXfNgajT/MU8B5fSKInnTMawv/tW7634B\\nw3v6n5kNIMIjJmENRsXBVMllDTkT9S7ApV+VoGnXRccbTiDapBThSGd0wri/CuwK\\nNWK1YFOeywKBgEDyI/XG114PBUJ43NLQVWm+wx5qszWAPqV/2S5MVXD1qC6zgCSv\\nG9NLWN1CIMimCNg6dm7Wn73IM7fzvhNCJgVkWqbItTLG6DFf3/DPODLx1wTMqLOI\\nqFqMLqmNm9l1Nec0dKp5BsjRQzq4zp1aX21hsfrTPmwjxeqJZdioqy2VAoGAXR5X\\nCCdSHlSlUW8RE2xNOOQw7KJjfWT+WAYoN0c7R+MQplL31rRU7dpm1bLLRBN11vJ8\\nMYvlT5RYuVdqQSP6BkrX+hLJNBvOLbRlL+EXOBrVyVxHCkDe+u7+DnC4epbn+N8P\\nLYpwqkDMKB7diPVAizIKTBxinXjMu5fkKDs5n+sCgYBbZheYKk5M0sIxiDfZuXGB\\nkf4mJdEkTI1KUGRdCwO/O7hXbroGoUVJTwqBLi1tKqLLarwCITje2T200BYOzj82\\nqwRkCXGtXPKnxYEEUOiFx9OeDrzsZV00cxsEnX0Zdj+PucQ/J3Cvd0dWUspJfLHJ\\n39gnaegswnz9KMQAvzKFdg==\\n-----END PRIVATE KEY-----\\n""" class FakeInventory: @@ -409,6 +409,8 @@ 
allowed_consecutive_spdk_ping_failures = 1 spdk_ping_interval_in_seconds = 2.0 ping_spdk_under_lock = False enable_monitor_client = True +max_hosts_per_namespace = 1 +max_namespaces_with_netmask = 1000 [gateway-logs] log_level = INFO @@ -442,7 +444,7 @@ rpc_socket_dir = /var/tmp/ rpc_socket_name = spdk.sock timeout = 60.0 bdevs_per_cluster = 32 -log_level = WARNING +protocol_log_level = WARNING conn_retries = 10 transports = tcp transport_tcp_options = {{"in_capsule_data_size": 8192, "max_io_qpairs_per_ctrlr": 7}} @@ -608,6 +610,101 @@ class TestMonitoring: @patch("cephadm.services.monitoring.password_hash", lambda password: 'alertmanager_password_hash') @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: 'cephadm_root_cert') @patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: ('mycert', 'mykey')) + def test_alertmanager_config_when_mgmt_gw_enabled(self, _get_fqdn, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) + + fqdn = 'host1.test' + _get_fqdn.return_value = fqdn + + with with_host(cephadm_module, 'test'): + cephadm_module.secure_monitoring_stack = True + cephadm_module.set_store(AlertmanagerService.USER_CFG_KEY, 'alertmanager_user') + cephadm_module.set_store(AlertmanagerService.PASS_CFG_KEY, 'alertmanager_plain_password') + with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, AlertManagerSpec()): + + y = dedent(""" + # This file is generated by cephadm. + # See https://prometheus.io/docs/alerting/configuration/ for documentation. + + global: + resolve_timeout: 5m + http_config: + tls_config: + ca_file: root_cert.pem + + route: + receiver: 'default' + routes: + - group_by: ['alertname'] + group_wait: 10s + group_interval: 10s + repeat_interval: 1h + receiver: 'ceph-dashboard' + + receivers: + - name: 'default' + webhook_configs: + - name: 'ceph-dashboard' + webhook_configs: + - url: 'https://host_fqdn:29443/internal/dashboard/api/prometheus_receiver' + """).lstrip() + + web_config = dedent(""" + tls_server_config: + cert_file: alertmanager.crt + key_file: alertmanager.key + client_auth_type: RequireAndVerifyClientCert + client_ca_file: root_cert.pem + basic_auth_users: + alertmanager_user: alertmanager_password_hash + """).lstrip() + + _run_cephadm.assert_called_with( + 'test', + "alertmanager.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'alertmanager.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [9093, 9094], + }, + "meta": { + 'service_name': 'alertmanager', + 'ports': [9093, 9094], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": { + "alertmanager.yml": y, + 'alertmanager.crt': 'mycert', + 'alertmanager.key': 'mykey', + 'web.yml': web_config, + 'root_cert.pem': 'cephadm_root_cert' + }, + 'peers': [], + 'web_config': '/etc/alertmanager/web.yml', + "use_url_prefix": True, + } + }), + use_current_daemon_image=False, + ) + + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("socket.getfqdn") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') + @patch("cephadm.services.monitoring.password_hash", lambda password: 'alertmanager_password_hash') + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: 'cephadm_root_cert') + @patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: 
('mycert', 'mykey')) def test_alertmanager_config_security_enabled(self, _get_fqdn, _run_cephadm, cephadm_module: CephadmOrchestrator): _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) @@ -739,6 +836,110 @@ class TestMonitoring: use_current_daemon_image=False) @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("mgr_module.MgrModule.get") + @patch("socket.getfqdn") + def test_node_exporter_config_without_mgmt_gw( + self, + mock_getfqdn, + mock_get, + _run_cephadm, + cephadm_module: CephadmOrchestrator, + ): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + fqdn = 'host1.test' + mock_getfqdn.return_value = fqdn + + with with_host(cephadm_module, "test"): + with with_service(cephadm_module, MonitoringSpec('node-exporter')): + _run_cephadm.assert_called_with( + 'test', + "node-exporter.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'node-exporter.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [9100], + }, + "meta": { + 'service_name': 'node-exporter', + 'ports': [9100], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": {} + }), + use_current_daemon_image=False, + ) + + @patch('cephadm.cert_mgr.CertMgr.generate_cert', lambda instance, fqdn, ip: (ceph_generated_cert, ceph_generated_key)) + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("socket.getfqdn") + def test_node_exporter_config_with_mgmt_gw( + self, + mock_getfqdn, + _run_cephadm, + cephadm_module: CephadmOrchestrator, + ): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + mock_getfqdn.return_value = 'host1.test' + + y = dedent(""" + tls_server_config: + cert_file: node_exporter.crt + key_file: node_exporter.key + client_auth_type: RequireAndVerifyClientCert + client_ca_file: root_cert.pem + """).lstrip() + + with with_host(cephadm_module, "test"): + with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, MonitoringSpec('node-exporter')): + _run_cephadm.assert_called_with( + 'test', + "node-exporter.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'node-exporter.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [9100], + }, + "meta": { + 'service_name': 'node-exporter', + 'ports': [9100], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": { + "web.yml": y, + 'root_cert.pem': f"{cephadm_root_ca}", + 'node_exporter.crt': f"{ceph_generated_cert}", + 'node_exporter.key': f"{ceph_generated_key}", + }, + 'web_config': '/etc/node-exporter/web.yml', + } + }), + use_current_daemon_image=False, + ) + + @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') def test_prometheus_config_security_disabled(self, _run_cephadm, cephadm_module: CephadmOrchestrator): _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) @@ -1244,6 +1445,286 @@ class TestMonitoring: @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4') @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn') @patch("cephadm.services.monitoring.verify_tls", lambda *_: None) + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', 
lambda instance: cephadm_root_ca) + def test_grafana_config_with_mgmt_gw_and_ouath2_proxy(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + + y = dedent(f""" + # This file is generated by cephadm. + apiVersion: 1 + + deleteDatasources: + - name: 'Dashboard1' + orgId: 1 + + datasources: + - name: 'Dashboard1' + type: 'prometheus' + access: 'proxy' + orgId: 1 + url: 'https://host_fqdn:29443/internal/prometheus' + basicAuth: true + isDefault: true + editable: false + basicAuthUser: admin + jsonData: + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: true + tlsSkipVerify: false + secureJsonData: + basicAuthPassword: admin + tlsCACert: "{cephadm_root_ca}" + tlsClientCert: "{ceph_generated_cert}" + tlsClientKey: "{ceph_generated_key}" + + - name: 'Loki' + type: 'loki' + access: 'proxy' + url: '' + basicAuth: false + isDefault: false + editable: false""").lstrip() + + oauth2_spec = OAuth2ProxySpec(provider_display_name='my_idp_provider', + client_id='my_client_id', + client_secret='my_client_secret', + oidc_issuer_url='http://192.168.10.10:8888/dex', + cookie_secret='kbAEM9opAmuHskQvt0AW8oeJRaOM2BYy5Loba0kZ0SQ=', + ssl_certificate=ceph_generated_cert, + ssl_certificate_key=ceph_generated_key) + + with with_host(cephadm_module, "test"): + cephadm_module.cert_key_store.save_cert('grafana_cert', ceph_generated_cert, host='test') + cephadm_module.cert_key_store.save_key('grafana_key', ceph_generated_key, host='test') + with with_service(cephadm_module, PrometheusSpec("prometheus")) as _, \ + with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, oauth2_spec) as _, \ + with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service( + cephadm_module, GrafanaSpec("grafana") + ) as _: + files = { + 'grafana.ini': dedent(""" + # This file is generated by cephadm. + [users] + default_theme = light + [auth.anonymous] + enabled = true + org_name = 'Main Org.' + org_role = 'Viewer' + [server] + domain = 'host_fqdn' + protocol = https + cert_file = /etc/grafana/certs/cert_file + cert_key = /etc/grafana/certs/cert_key + http_port = 3000 + http_addr = + root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ + serve_from_sub_path = true + [snapshots] + external_enabled = false + [security] + disable_initial_admin_creation = true + cookie_secure = true + cookie_samesite = none + allow_embedding = true + [auth] + disable_login_form = true + [auth.proxy] + enabled = true + header_name = X-WEBAUTH-USER + header_property = username + auto_sign_up = true + sync_ttl = 15 + whitelist = 1::4 + headers_encoded = false + enable_login_token = false + headers = Role:X-WEBAUTH-ROLE\n""").lstrip(), # noqa: W291 + "provisioning/datasources/ceph-dashboard.yml": y, + 'certs/cert_file': dedent(f""" + # generated by cephadm\n{ceph_generated_cert}""").lstrip(), + 'certs/cert_key': dedent(f""" + # generated by cephadm\n{ceph_generated_key}""").lstrip(), + 'provisioning/dashboards/default.yml': dedent(""" + # This file is generated by cephadm. 
+ apiVersion: 1 + + providers: + - name: 'Ceph Dashboard' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 3 + editable: false + options: + path: '/etc/grafana/provisioning/dashboards'""").lstrip(), + } + + _run_cephadm.assert_called_with( + 'test', + "grafana.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'grafana.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [3000], + }, + "meta": { + 'service_name': 'grafana', + 'ports': [3000], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": files, + }, + }), + use_current_daemon_image=False, + ) + + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4') + @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn') + @patch("cephadm.services.monitoring.verify_tls", lambda *_: None) + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) + def test_grafana_config_with_mgmt_gw(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) + + y = dedent(f""" + # This file is generated by cephadm. + apiVersion: 1 + + deleteDatasources: + - name: 'Dashboard1' + orgId: 1 + + datasources: + - name: 'Dashboard1' + type: 'prometheus' + access: 'proxy' + orgId: 1 + url: 'https://host_fqdn:29443/internal/prometheus' + basicAuth: true + isDefault: true + editable: false + basicAuthUser: admin + jsonData: + graphiteVersion: "1.1" + tlsAuth: false + tlsAuthWithCACert: true + tlsSkipVerify: false + secureJsonData: + basicAuthPassword: admin + tlsCACert: "{cephadm_root_ca}" + tlsClientCert: "{ceph_generated_cert}" + tlsClientKey: "{ceph_generated_key}" + + - name: 'Loki' + type: 'loki' + access: 'proxy' + url: '' + basicAuth: false + isDefault: false + editable: false""").lstrip() + + with with_host(cephadm_module, "test"): + cephadm_module.cert_key_store.save_cert('grafana_cert', ceph_generated_cert, host='test') + cephadm_module.cert_key_store.save_key('grafana_key', ceph_generated_key, host='test') + with with_service( + cephadm_module, PrometheusSpec("prometheus") + ) as _, with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ + with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service( + cephadm_module, GrafanaSpec("grafana") + ) as _: + files = { + 'grafana.ini': dedent(""" + # This file is generated by cephadm. + [users] + default_theme = light + [auth.anonymous] + enabled = true + org_name = 'Main Org.' + org_role = 'Viewer' + [server] + domain = 'host_fqdn' + protocol = https + cert_file = /etc/grafana/certs/cert_file + cert_key = /etc/grafana/certs/cert_key + http_port = 3000 + http_addr = + root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ + serve_from_sub_path = true + [snapshots] + external_enabled = false + [security] + disable_initial_admin_creation = true + cookie_secure = true + cookie_samesite = none + allow_embedding = true\n""").lstrip(), # noqa: W291 + "provisioning/datasources/ceph-dashboard.yml": y, + 'certs/cert_file': dedent(f""" + # generated by cephadm\n{ceph_generated_cert}""").lstrip(), + 'certs/cert_key': dedent(f""" + # generated by cephadm\n{ceph_generated_key}""").lstrip(), + 'provisioning/dashboards/default.yml': dedent(""" + # This file is generated by cephadm. 
+ apiVersion: 1 + + providers: + - name: 'Ceph Dashboard' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 3 + editable: false + options: + path: '/etc/grafana/provisioning/dashboards'""").lstrip(), + } + + _run_cephadm.assert_called_with( + 'test', + "grafana.test", + ['_orch', 'deploy'], + [], + stdin=json.dumps({ + "fsid": "fsid", + "name": 'grafana.test', + "image": '', + "deploy_arguments": [], + "params": { + 'tcp_ports': [3000], + }, + "meta": { + 'service_name': 'grafana', + 'ports': [3000], + 'ip': None, + 'deployed_by': [], + 'rank': None, + 'rank_generation': None, + 'extra_container_args': None, + 'extra_entrypoint_args': None, + }, + "config_blobs": { + "files": files, + }, + }), + use_current_daemon_image=False, + ) + + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '1::4') + @patch("cephadm.module.CephadmOrchestrator.get_fqdn", lambda a, b: 'host_fqdn') + @patch("cephadm.services.monitoring.verify_tls", lambda *_: None) def test_grafana_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator): _run_cephadm.side_effect = async_side_effect(("{}", "", 0)) @@ -2710,6 +3191,7 @@ class TestIngressService: ' Enable_RQUOTA = false;\n' ' Protocols = 4;\n' ' NFS_Port = 2049;\n' + ' allow_set_io_flusher_fail = true;\n' ' HAProxy_Hosts = 192.168.122.111, 10.10.2.20, 192.168.122.222;\n' '}\n' '\n' @@ -3289,14 +3771,19 @@ class TestSMB: class TestMgmtGateway: @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_endpoints") + @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_discovery_endpoints") @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_external_certificates", lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key)) @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_internal_certificates", - lambda instance, dspec: (ceph_generated_cert, ceph_generated_key)) + lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key)) @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) @patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https")) - def test_mgmt_gateway_config_no_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): + def test_mgmt_gateway_config_no_auth(self, + get_service_discovery_endpoints_mock: List[str], + get_service_endpoints_mock: List[str], + _run_cephadm, + cephadm_module: CephadmOrchestrator): def get_services_endpoints(name): if name == 'prometheus': @@ -3309,6 +3796,7 @@ class TestMgmtGateway: _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) get_service_endpoints_mock.side_effect = get_services_endpoints + get_service_discovery_endpoints_mock.side_effect = lambda: ["ceph-node-0:8765", "ceph-node-2:8765"] server_port = 5555 spec = MgmtGatewaySpec(port=server_port, @@ -3343,6 +3831,7 @@ class TestMgmtGateway: http { + #access_log /dev/stdout; client_header_buffer_size 32K; large_client_header_buffers 4 32k; proxy_busy_buffers_size 512k; @@ -3351,6 +3840,12 @@ class TestMgmtGateway: proxy_headers_hash_max_size 1024; proxy_headers_hash_bucket_size 128; + + upstream service_discovery_servers { + server ceph-node-0:8765; + server ceph-node-2:8765; + } + upstream dashboard_servers { server 
ceph-node-2:8443; server ceph-node-2:8443; @@ -3417,11 +3912,11 @@ class TestMgmtGateway: } location /grafana { - rewrite ^/grafana/(.*) /$1 break; proxy_pass https://grafana_servers; # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services proxy_set_header Authorization ""; + proxy_buffering off; } location /prometheus { @@ -3446,6 +3941,9 @@ class TestMgmtGateway: }"""), "nginx_internal_server.conf": dedent(""" server { + ssl_client_certificate /etc/nginx/ssl/ca.crt; + ssl_verify_client on; + listen 29443 ssl; listen [::]:29443 ssl; ssl_certificate /etc/nginx/ssl/nginx_internal.crt; @@ -3455,6 +3953,12 @@ class TestMgmtGateway: ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-RSA-CHACHA20-POLY1305; ssl_prefer_server_ciphers on; + location /internal/sd { + rewrite ^/internal/(.*) /$1 break; + proxy_pass https://service_discovery_servers; + proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504; + } + location /internal/dashboard { rewrite ^/internal/dashboard/(.*) /$1 break; proxy_pass https://dashboard_servers; @@ -3510,15 +4014,19 @@ class TestMgmtGateway: @patch("cephadm.serve.CephadmServe._run_cephadm") @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_endpoints") + @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_discovery_endpoints") @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_external_certificates", lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key)) @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_internal_certificates", - lambda instance, dspec: (ceph_generated_cert, ceph_generated_key)) + lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key)) @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) @patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https")) - @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_oauth2_service_url", lambda _: "https://192.168.100.102:4180") - def test_mgmt_gateway_config_with_auth(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): + def test_mgmt_gateway_config_with_auth(self, + get_service_discovery_endpoints_mock: List[str], + get_service_endpoints_mock: List[str], + _run_cephadm, + cephadm_module: CephadmOrchestrator): def get_services_endpoints(name): if name == 'prometheus': @@ -3527,10 +4035,13 @@ class TestMgmtGateway: return ["ceph-node-2:3000", "ceph-node-2:3000"] elif name == 'alertmanager': return ["192.168.100.100:9093", "192.168.100.102:9093"] + elif name == 'oauth2-proxy': + return ["192.168.100.101:4180", "192.168.100.102:4180"] return [] _run_cephadm.side_effect = async_side_effect(('{}', '', 0)) get_service_endpoints_mock.side_effect = get_services_endpoints + get_service_discovery_endpoints_mock.side_effect = lambda: ["ceph-node-0:8765", "ceph-node-2:8765"] server_port = 5555 spec = MgmtGatewaySpec(port=server_port, @@ -3566,6 +4077,7 @@ class TestMgmtGateway: http { + #access_log /dev/stdout; client_header_buffer_size 32K; large_client_header_buffers 4 32k; 
proxy_busy_buffers_size 512k; @@ -3574,6 +4086,16 @@ class TestMgmtGateway: proxy_headers_hash_max_size 1024; proxy_headers_hash_bucket_size 128; + upstream oauth2_proxy_servers { + server 192.168.100.101:4180; + server 192.168.100.102:4180; + } + + upstream service_discovery_servers { + server ceph-node-0:8765; + server ceph-node-2:8765; + } + upstream dashboard_servers { server ceph-node-2:8443; server ceph-node-2:8443; @@ -3634,7 +4156,7 @@ class TestMgmtGateway: # add_header Content-Security-Policy "default-src 'self'; script-src 'self'; object-src 'none'; base-uri 'none'; require-trusted-types-for 'script'; frame-ancestors 'self';"; location /oauth2/ { - proxy_pass https://192.168.100.102:4180; + proxy_pass https://oauth2_proxy_servers; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Scheme $scheme; @@ -3644,7 +4166,7 @@ class TestMgmtGateway: location = /oauth2/auth { internal; - proxy_pass https://192.168.100.102:4180; + proxy_pass https://oauth2_proxy_servers; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Scheme $scheme; @@ -3689,11 +4211,11 @@ class TestMgmtGateway: } location /grafana { - rewrite ^/grafana/(.*) /$1 break; proxy_pass https://grafana_servers; # clear any Authorization header as Prometheus and Alertmanager are using basic-auth browser # will send this header if Grafana is running on the same node as one of those services proxy_set_header Authorization ""; + proxy_buffering off; auth_request /oauth2/auth; error_page 401 = /oauth2/sign_in; @@ -3760,6 +4282,9 @@ class TestMgmtGateway: }"""), "nginx_internal_server.conf": dedent(""" server { + ssl_client_certificate /etc/nginx/ssl/ca.crt; + ssl_verify_client on; + listen 29443 ssl; listen [::]:29443 ssl; ssl_certificate /etc/nginx/ssl/nginx_internal.crt; @@ -3769,6 +4294,12 @@ class TestMgmtGateway: ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:DHE-RSA-CHACHA20-POLY1305; ssl_prefer_server_ciphers on; + location /internal/sd { + rewrite ^/internal/(.*) /$1 break; + proxy_pass https://service_discovery_servers; + proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504; + } + location /internal/dashboard { rewrite ^/internal/dashboard/(.*) /$1 break; proxy_pass https://dashboard_servers; @@ -3827,12 +4358,26 @@ class TestMgmtGateway: @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_external_certificates", lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key)) @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_internal_certificates", - lambda instance, dspec: (ceph_generated_cert, ceph_generated_key)) + lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key)) @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) @patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https")) - def test_oauth2_proxy_service(self, get_service_endpoints_mock: List[str], _run_cephadm, cephadm_module: CephadmOrchestrator): + def test_oauth2_proxy_service(self, get_service_endpoints_mock, _run_cephadm, cephadm_module): + self.oauth2_proxy_service_common(get_service_endpoints_mock, _run_cephadm, cephadm_module, 
virtual_ip=None) + @patch("cephadm.serve.CephadmServe._run_cephadm") + @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_service_endpoints") + @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_external_certificates", + lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key)) + @patch("cephadm.services.mgmt_gateway.MgmtGatewayService.get_internal_certificates", + lambda instance, svc_spec, dspec: (ceph_generated_cert, ceph_generated_key)) + @patch("cephadm.module.CephadmOrchestrator.get_mgr_ip", lambda _: '::1') + @patch('cephadm.cert_mgr.CertMgr.get_root_ca', lambda instance: cephadm_root_ca) + @patch("cephadm.services.mgmt_gateway.get_dashboard_endpoints", lambda _: (["ceph-node-2:8443", "ceph-node-2:8443"], "https")) + def test_oauth2_proxy_service_with_ha(self, get_service_endpoints_mock, _run_cephadm, cephadm_module): + self.oauth2_proxy_service_common(get_service_endpoints_mock, _run_cephadm, cephadm_module, virtual_ip="192.168.100.200") + + def oauth2_proxy_service_common(self, get_service_endpoints_mock, _run_cephadm, cephadm_module: CephadmOrchestrator, virtual_ip=None): def get_services_endpoints(name): if name == 'prometheus': return ["192.168.100.100:9095", "192.168.100.101:9095"] @@ -3849,7 +4394,8 @@ class TestMgmtGateway: mgmt_gw_spec = MgmtGatewaySpec(port=server_port, ssl_certificate=ceph_generated_cert, ssl_certificate_key=ceph_generated_key, - enable_auth=True) + enable_auth=True, + virtual_ip=virtual_ip) oauth2_spec = OAuth2ProxySpec(provider_display_name='my_idp_provider', client_id='my_client_id', @@ -3858,6 +4404,8 @@ class TestMgmtGateway: cookie_secret='kbAEM9opAmuHskQvt0AW8oeJRaOM2BYy5Loba0kZ0SQ=', ssl_certificate=ceph_generated_cert, ssl_certificate_key=ceph_generated_key) + + redirect_url = f"https://{virtual_ip if virtual_ip else 'host_fqdn'}:5555/oauth2/callback" expected = { "fsid": "fsid", "name": "oauth2-proxy.ceph-node", @@ -3876,7 +4424,7 @@ class TestMgmtGateway: }, "config_blobs": { "files": { - "oauth2-proxy.conf": dedent(""" + "oauth2-proxy.conf": dedent(f""" # Listen on port 4180 for incoming HTTP traffic. 
https_address= "0.0.0.0:4180" @@ -3889,7 +4437,7 @@ class TestMgmtGateway: client_id= "my_client_id" client_secret= "my_client_secret" oidc_issuer_url= "http://192.168.10.10:8888/dex" - redirect_url= "https://host_fqdn:5555/oauth2/callback" + redirect_url= "{redirect_url}" ssl_insecure_skip_verify=true diff --git a/src/pybind/mgr/cephadm/tests/test_spec.py b/src/pybind/mgr/cephadm/tests/test_spec.py index 78a2d73118f..42e590945cd 100644 --- a/src/pybind/mgr/cephadm/tests/test_spec.py +++ b/src/pybind/mgr/cephadm/tests/test_spec.py @@ -130,7 +130,7 @@ def test_spec_octopus(spec_json): "hostname": "ceph-001", "container_id": "d94d7969094d", "container_image_id": "0881eb8f169f5556a292b4e2c01d683172b12830a62a9225a98a8e206bb734f0", - "container_image_name": "docker.io/prom/alertmanager:latest", + "container_image_name": "quay.io/prometheus/alertmanager:latest", "daemon_id": "ceph-001", "daemon_type": "alertmanager", "version": "0.20.0", @@ -145,7 +145,7 @@ def test_spec_octopus(spec_json): "hostname": "ceph-001", "container_id": "c4b036202241", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "ceph-001", "daemon_type": "crash", "version": "15.2.0", @@ -160,7 +160,7 @@ def test_spec_octopus(spec_json): "hostname": "ceph-001", "container_id": "5b7b94b48f31", "container_image_id": "87a51ecf0b1c9a7b187b21c1b071425dafea0d765a96d5bc371c791169b3d7f4", - "container_image_name": "docker.io/ceph/ceph-grafana:latest", + "container_image_name": "quay.io/ceph/ceph-grafana:latest", "daemon_id": "ceph-001", "daemon_type": "grafana", "version": "6.6.2", @@ -175,7 +175,7 @@ def test_spec_octopus(spec_json): "hostname": "ceph-001", "container_id": "9ca007280456", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "ceph-001.gkjwqp", "daemon_type": "mgr", "version": "15.2.0", @@ -190,7 +190,7 @@ def test_spec_octopus(spec_json): "hostname": "ceph-001", "container_id": "3d1ba9a2b697", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "ceph-001", "daemon_type": "mon", "version": "15.2.0", @@ -205,7 +205,7 @@ def test_spec_octopus(spec_json): "hostname": "ceph-001", "container_id": "36d026c68ba1", "container_image_id": "e5a616e4b9cf68dfcad7782b78e118be4310022e874d52da85c55923fb615f87", - "container_image_name": "docker.io/prom/node-exporter:latest", + "container_image_name": "quay.io/prometheus/node-exporter:latest", "daemon_id": "ceph-001", "daemon_type": "node-exporter", "version": "0.18.1", @@ -220,7 +220,7 @@ def test_spec_octopus(spec_json): "hostname": "ceph-001", "container_id": "faf76193cbfe", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", "daemon_id": "0", "daemon_type": "osd", "version": "15.2.0", @@ -235,7 +235,7 @@ def test_spec_octopus(spec_json): "hostname": "ceph-001", "container_id": "f82505bae0f1", "container_image_id": "204a01f9b0b6710dd0c0af7f37ce7139c47ff0f0105d778d7104c69282dfbbf1", - "container_image_name": "docker.io/ceph/ceph:v15", + "container_image_name": "quay.io/ceph/ceph:v15", 
"daemon_id": "1", "daemon_type": "osd", "version": "15.2.0", @@ -250,7 +250,7 @@ def test_spec_octopus(spec_json): "hostname": "ceph-001", "container_id": "2708d84cd484", "container_image_id": "358a0d2395fe711bb8258e8fb4b2d7865c0a9a6463969bcd1452ee8869ea6653", - "container_image_name": "docker.io/prom/prometheus:latest", + "container_image_name": "quay.io/prom/prometheus:latest", "daemon_id": "ceph-001", "daemon_type": "prometheus", "version": "2.17.1", @@ -569,7 +569,7 @@ def test_dd_octopus(dd_json): CustomContainerSpec( service_type='container', service_id='hello-world', - image='docker.io/library/hello-world:latest', + image='quay.io/hello-world/hello-world:latest', ), DaemonDescription( daemon_type='container', diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index d8ffab2da51..ed3d26807e5 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -29,17 +29,17 @@ CEPH_MDSMAP_NOT_JOINABLE = (1 << 0) def normalize_image_digest(digest: str, default_registry: str) -> str: """ Normal case: - >>> normalize_image_digest('ceph/ceph', 'docker.io') - 'docker.io/ceph/ceph' + >>> normalize_image_digest('ceph/ceph', 'quay.io') + 'quay.io/ceph/ceph' No change: - >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'docker.io') + >>> normalize_image_digest('quay.ceph.io/ceph/ceph', 'quay.io') 'quay.ceph.io/ceph/ceph' - >>> normalize_image_digest('docker.io/ubuntu', 'docker.io') - 'docker.io/ubuntu' + >>> normalize_image_digest('quay.io/centos', 'quay.io') + 'quay.io/centos' - >>> normalize_image_digest('localhost/ceph', 'docker.io') + >>> normalize_image_digest('localhost/ceph', 'quay.io') 'localhost/ceph' """ known_shortnames = [ diff --git a/src/pybind/mgr/cephadm/utils.py b/src/pybind/mgr/cephadm/utils.py index 3673fbf621c..edd775aa178 100644 --- a/src/pybind/mgr/cephadm/utils.py +++ b/src/pybind/mgr/cephadm/utils.py @@ -5,6 +5,7 @@ from enum import Enum from functools import wraps from typing import Optional, Callable, TypeVar, List, NewType, TYPE_CHECKING, Any, NamedTuple from orchestrator import OrchestratorError +import hashlib if TYPE_CHECKING: from cephadm import CephadmOrchestrator @@ -154,3 +155,9 @@ def file_mode_to_str(mode: int) -> str: f'{"x" if (mode >> shift) & 1 else "-"}' ) + r return r + + +def md5_hash(input_value: str) -> str: + input_str = str(input_value).encode('utf-8') + hash_object = hashlib.md5(input_str) + return hash_object.hexdigest() diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py index ec9c9897081..519c310a98b 100644 --- a/src/pybind/mgr/dashboard/controllers/nvmeof.py +++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py @@ -63,7 +63,10 @@ else: @EndpointDoc( "Get information from a specific NVMeoF subsystem", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_model(model.Subsystem, first="subsystems") @handle_nvmeof_error @@ -78,6 +81,7 @@ else: "nqn": Param(str, "NVMeoF subsystem NQN"), "max_namespaces": Param(int, "Maximum number of namespaces", True, 1024), "enable_ha": Param(bool, "Enable high availability"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -95,6 +99,7 @@ else: parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "force": Param(bool, "Force delete", "false"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ 
-111,12 +116,15 @@ else: class NVMeoFListener(RESTController): @EndpointDoc( "List all NVMeoF listeners", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection(model.Listener, pick="listeners") @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_listeners( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_listeners( NVMeoFClient.pb2.list_listeners_req(subsystem=nqn) ) @@ -128,6 +136,7 @@ else: "traddr": Param(str, "NVMeoF transport address"), "trsvcid": Param(int, "NVMeoF transport service port", True, 4420), "adrfam": Param(int, "NVMeoF address family (0 - IPv4, 1 - IPv6)", True, 0), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -138,9 +147,10 @@ else: host_name: str, traddr: str, trsvcid: int = 4420, - adrfam: int = 0, # IPv4 + adrfam: int = 0, # IPv4, + gw_group: Optional[str] = None ): - return NVMeoFClient().stub.create_listener( + return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.create_listener( NVMeoFClient.pb2.create_listener_req( nqn=nqn, host_name=host_name, @@ -158,6 +168,7 @@ else: "traddr": Param(str, "NVMeoF transport address"), "trsvcid": Param(int, "NVMeoF transport service port", True, 4420), "adrfam": Param(int, "NVMeoF address family (0 - IPv4, 1 - IPv6)", True, 0), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -170,8 +181,9 @@ else: trsvcid: int = 4420, adrfam: int = 0, # IPv4 force: bool = False, + gw_group: Optional[str] = None ): - return NVMeoFClient().stub.delete_listener( + return NVMeoFClient(gw_group=gw_group, traddr=traddr).stub.delete_listener( NVMeoFClient.pb2.delete_listener_req( nqn=nqn, host_name=host_name, @@ -187,12 +199,15 @@ else: class NVMeoFNamespace(RESTController): @EndpointDoc( "List all NVMeoF namespaces in a subsystem", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection(model.Namespace, pick="namespaces") @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_namespaces( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_namespaces( NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn) ) @@ -201,12 +216,13 @@ else: parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "nsid": Param(str, "NVMeoF Namespace ID"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @map_model(model.Namespace, first="namespaces") @handle_nvmeof_error - def get(self, nqn: str, nsid: str): - return NVMeoFClient().stub.list_namespaces( + def get(self, nqn: str, nsid: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_namespaces( NVMeoFClient.pb2.list_namespaces_req(subsystem=nqn, nsid=int(nsid)) ) @@ -217,12 +233,13 @@ else: parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "nsid": Param(str, "NVMeoF Namespace ID"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @map_model(model.NamespaceIOStats) @handle_nvmeof_error - def io_stats(self, nqn: str, nsid: str): - return NVMeoFClient().stub.namespace_get_io_stats( + def io_stats(self, nqn: str, nsid: str, gw_group: Optional[str] = None): + return 
NVMeoFClient(gw_group=gw_group).stub.namespace_get_io_stats( NVMeoFClient.pb2.namespace_get_io_stats_req( subsystem_nqn=nqn, nsid=int(nsid)) ) @@ -237,6 +254,7 @@ else: "size": Param(int, "RBD image size"), "block_size": Param(int, "NVMeoF namespace block size"), "load_balancing_group": Param(int, "Load balancing group"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @map_model(model.NamespaceCreation) @@ -250,8 +268,9 @@ else: size: Optional[int] = 1024, block_size: int = 512, load_balancing_group: Optional[int] = None, + gw_group: Optional[str] = None, ): - return NVMeoFClient().stub.namespace_add( + return NVMeoFClient(gw_group=gw_group).stub.namespace_add( NVMeoFClient.pb2.namespace_add_req( subsystem_nqn=nqn, rbd_image_name=rbd_image_name, @@ -274,6 +293,7 @@ else: "rw_mbytes_per_second": Param(int, "Read/Write MB/s"), "r_mbytes_per_second": Param(int, "Read MB/s"), "w_mbytes_per_second": Param(int, "Write MB/s"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @@ -288,12 +308,13 @@ else: rw_mbytes_per_second: Optional[int] = None, r_mbytes_per_second: Optional[int] = None, w_mbytes_per_second: Optional[int] = None, + gw_group: Optional[str] = None ): if rbd_image_size: mib = 1024 * 1024 new_size_mib = int((rbd_image_size + mib - 1) / mib) - response = NVMeoFClient().stub.namespace_resize( + response = NVMeoFClient(gw_group=gw_group).stub.namespace_resize( NVMeoFClient.pb2.namespace_resize_req( subsystem_nqn=nqn, nsid=int(nsid), new_size=new_size_mib ) @@ -336,12 +357,13 @@ else: parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "nsid": Param(str, "NVMeoF Namespace ID"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @handle_nvmeof_error - def delete(self, nqn: str, nsid: str): - return NVMeoFClient().stub.namespace_delete( + def delete(self, nqn: str, nsid: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.namespace_delete( NVMeoFClient.pb2.namespace_delete_req(subsystem_nqn=nqn, nsid=int(nsid)) ) @@ -351,7 +373,10 @@ else: class NVMeoFHost(RESTController): @EndpointDoc( "List all allowed hosts for an NVMeoF subsystem", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection( model.Host, @@ -362,8 +387,8 @@ else: else o, ) @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_hosts( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_hosts( NVMeoFClient.pb2.list_hosts_req(subsystem=nqn) ) @@ -372,12 +397,13 @@ else: parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "host_nqn": Param(str, 'NVMeoF host NQN. Use "*" to allow any host.'), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @handle_nvmeof_error - def create(self, nqn: str, host_nqn: str): - return NVMeoFClient().stub.add_host( + def create(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.add_host( NVMeoFClient.pb2.add_host_req(subsystem_nqn=nqn, host_nqn=host_nqn) ) @@ -386,12 +412,13 @@ else: parameters={ "nqn": Param(str, "NVMeoF subsystem NQN"), "host_nqn": Param(str, 'NVMeoF host NQN. 
Use "*" to disallow any host.'), + "gw_group": Param(str, "NVMeoF gateway group", True, None), }, ) @empty_response @handle_nvmeof_error - def delete(self, nqn: str, host_nqn: str): - return NVMeoFClient().stub.remove_host( + def delete(self, nqn: str, host_nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.remove_host( NVMeoFClient.pb2.remove_host_req(subsystem_nqn=nqn, host_nqn=host_nqn) ) @@ -400,12 +427,15 @@ else: class NVMeoFConnection(RESTController): @EndpointDoc( "List all NVMeoF Subsystem Connections", - parameters={"nqn": Param(str, "NVMeoF subsystem NQN")}, + parameters={ + "nqn": Param(str, "NVMeoF subsystem NQN"), + "gw_group": Param(str, "NVMeoF gateway group", True, None), + }, ) @map_collection(model.Connection, pick="connections") @handle_nvmeof_error - def list(self, nqn: str): - return NVMeoFClient().stub.list_connections( + def list(self, nqn: str, gw_group: Optional[str] = None): + return NVMeoFClient(gw_group=gw_group).stub.list_connections( NVMeoFClient.pb2.list_connections_req(subsystem=nqn) ) @@ -433,16 +463,17 @@ else: parameters={ 'subsystem_nqn': (str, 'Subsystem NQN'), "host_nqn": Param(str, 'Comma separated list of NVMeoF host NQNs'), + "gw_group": Param(str, "NVMeoF gateway group") }) @empty_response @handle_nvmeof_error @CreatePermission - def add(self, subsystem_nqn: str, host_nqn: str = ""): + def add(self, subsystem_nqn: str, gw_group: str, host_nqn: str = ""): response = None all_host_nqns = host_nqn.split(',') for nqn in all_host_nqns: - response = NVMeoFClient().stub.add_host( + response = NVMeoFClient(gw_group=gw_group).stub.add_host( NVMeoFClient.pb2.add_host_req(subsystem_nqn=subsystem_nqn, host_nqn=nqn) ) if response.status != 0: @@ -454,16 +485,17 @@ else: parameters={ "subsystem_nqn": Param(str, "NVMeoF subsystem NQN"), "host_nqn": Param(str, 'Comma separated list of NVMeoF host NQN.'), + "gw_group": Param(str, "NVMeoF gateway group") }) @empty_response @handle_nvmeof_error @DeletePermission - def remove(self, subsystem_nqn: str, host_nqn: str): + def remove(self, subsystem_nqn: str, host_nqn: str, gw_group: str): response = None to_delete_nqns = host_nqn.split(',') for del_nqn in to_delete_nqns: - response = NVMeoFClient().stub.remove_host( + response = NVMeoFClient(gw_group=gw_group).stub.remove_host( NVMeoFClient.pb2.remove_host_req(subsystem_nqn=subsystem_nqn, host_nqn=del_nqn) ) if response.status != 0: diff --git a/src/pybind/mgr/dashboard/controllers/osd.py b/src/pybind/mgr/dashboard/controllers/osd.py index c9d14177200..07d8db7755b 100644 --- a/src/pybind/mgr/dashboard/controllers/osd.py +++ b/src/pybind/mgr/dashboard/controllers/osd.py @@ -5,12 +5,14 @@ import logging import time from typing import Any, Dict, List, Optional, Union +import cherrypy from ceph.deployment.drive_group import DriveGroupSpec, DriveGroupValidationError # type: ignore from mgr_util import get_most_recent_rate from .. 
import mgr from ..exceptions import DashboardException from ..security import Scope +from ..services._paginate import ListPaginator from ..services.ceph_service import CephService, SendCommandError from ..services.exception import handle_orchestrator_error, handle_send_command_error from ..services.orchestrator import OrchClient, OrchFeature @@ -121,8 +123,30 @@ def osd_task(name, metadata, wait_for=2.0): @APIRouter('/osd', Scope.OSD) @APIDoc('OSD management API', 'OSD') class Osd(RESTController): - def list(self): - osds = self.get_osd_map() + @RESTController.MethodMap(version=APIVersion(1, 1)) + def list(self, offset: int = 0, limit: int = 10, + search: str = '', sort: str = ''): + all_osds = self.get_osd_map() + + paginator = ListPaginator(int(offset), int(limit), sort, search, + input_list=all_osds.values(), + searchable_params=['id'], + sortable_params=['id'], + default_sort='+id') + + cherrypy.response.headers['X-Total-Count'] = paginator.get_count() + + paginated_osds_list = list(paginator.list()) + # creating a dictionary to have faster lookups + paginated_osds_by_id = {osd['id']: osd for osd in paginated_osds_list} + try: + osds = { + key: paginated_osds_by_id[int(key)] + for key in all_osds.keys() + if int(key) in paginated_osds_by_id + } + except ValueError as e: + raise DashboardException(e, component='osd', http_status_code=400) # Extending by osd stats information for stat in mgr.get('osd_stats')['osd_stats']: diff --git a/src/pybind/mgr/dashboard/controllers/rgw.py b/src/pybind/mgr/dashboard/controllers/rgw.py index 8667d469060..9d257674794 100755 --- a/src/pybind/mgr/dashboard/controllers/rgw.py +++ b/src/pybind/mgr/dashboard/controllers/rgw.py @@ -16,6 +16,7 @@ from ..services.auth import AuthManager, JwtManager from ..services.ceph_service import CephService from ..services.rgw_client import _SYNC_GROUP_ID, NoRgwDaemonsException, \ RgwClient, RgwMultisite, RgwMultisiteAutomation +from ..services.rgw_iam import RgwAccounts from ..services.service import RgwServiceManager, wait_for_daemon_to_start from ..tools import json_str_to_object, str_to_bool from . 
import APIDoc, APIRouter, BaseController, CreatePermission, \ @@ -162,9 +163,9 @@ class RgwMultisiteController(RESTController): @ReadPermission @allow_empty_body # pylint: disable=W0102,W0613 - def get_sync_status(self): + def get_sync_status(self, daemon_name=None): multisite_instance = RgwMultisite() - result = multisite_instance.get_multisite_sync_status() + result = multisite_instance.get_multisite_sync_status(daemon_name) return result @Endpoint(path='/sync-policy') @@ -176,6 +177,15 @@ class RgwMultisiteController(RESTController): if all_policy: sync_policy_list = [] buckets = json.loads(RgwBucket().list(stats=False)) + zonegroups_info = RgwMultisite().get_all_zonegroups_info() + default_zonegroup = '' + if 'zonegroups' in zonegroups_info and 'default_zonegroup' in zonegroups_info: + default_zonegroup = next( + (zonegroup['name'] for zonegroup in zonegroups_info['zonegroups'] + if 'id' in zonegroup and 'name' in zonegroup + and zonegroup['id'] == zonegroups_info['default_zonegroup']), + '' + ) for bucket in buckets: sync_policy = multisite_instance.get_sync_policy(bucket, zonegroup_name) for policy in sync_policy['groups']: @@ -183,6 +193,7 @@ class RgwMultisiteController(RESTController): sync_policy_list.append(policy) other_sync_policy = multisite_instance.get_sync_policy(bucket_name, zonegroup_name) for policy in other_sync_policy['groups']: + policy['zonegroup'] = default_zonegroup sync_policy_list.append(policy) return sync_policy_list return multisite_instance.get_sync_policy(bucket_name, zonegroup_name) @@ -244,11 +255,13 @@ class RgwMultisiteController(RESTController): source_zones: Dict[str, Any], destination_zones: Dict[str, Any], source_bucket: str = '', - destination_bucket: str = '', bucket_name: str = ''): + destination_bucket: str = '', bucket_name: str = '', + user: str = '', mode: str = ''): multisite_instance = RgwMultisite() return multisite_instance.create_sync_pipe(group_id, pipe_id, source_zones, destination_zones, source_bucket, - destination_bucket, bucket_name, True) + destination_bucket, bucket_name, True, + user, mode) @Endpoint(method='DELETE', path='/sync-pipe') @EndpointDoc("Remove the sync pipe") @@ -256,12 +269,10 @@ class RgwMultisiteController(RESTController): def remove_sync_pipe(self, group_id: str, pipe_id: str, source_zones: Optional[List[str]] = None, destination_zones: Optional[List[str]] = None, - destination_bucket: str = '', bucket_name: str = ''): multisite_instance = RgwMultisite() return multisite_instance.remove_sync_pipe(group_id, pipe_id, source_zones, - destination_zones, destination_bucket, - bucket_name, True) + destination_zones, bucket_name, True) @APIRouter('/rgw/daemon', Scope.RGW) @@ -389,6 +400,15 @@ class RgwBucket(RgwRESTController): if bucket['tenant'] else bucket['bucket'] return bucket + def _get_owner(self, owner): + accounts = RgwAccounts().get_accounts() + + # if the owner is present in the accounts list, + # then the bucket is owned by an account. + # hence we will use dashboard user to fetch the + # bucket info + return owner if owner not in accounts else RgwServiceManager.user + def _get_versioning(self, owner, daemon_name, bucket_name): rgw_client = RgwClient.instance(owner, daemon_name) return rgw_client.get_bucket_versioning(bucket_name) @@ -532,19 +552,20 @@ class RgwBucket(RgwRESTController): bucket_name = RgwBucket.get_s3_bucket_name(result['bucket'], result['tenant']) + owner = self._get_owner(result['owner']) # Append the versioning configuration. 
- versioning = self._get_versioning(result['owner'], daemon_name, bucket_name) - encryption = self._get_encryption(bucket_name, daemon_name, result['owner']) + versioning = self._get_versioning(owner, daemon_name, bucket_name) + encryption = self._get_encryption(bucket_name, daemon_name, owner) result['encryption'] = encryption['Status'] result['versioning'] = versioning['Status'] result['mfa_delete'] = versioning['MfaDelete'] - result['bucket_policy'] = self._get_policy(bucket_name, daemon_name, result['owner']) - result['acl'] = self._get_acl(bucket_name, daemon_name, result['owner']) - result['replication'] = self._get_replication(bucket_name, result['owner'], daemon_name) - result['lifecycle'] = self._get_lifecycle(bucket_name, daemon_name, result['owner']) + result['bucket_policy'] = self._get_policy(bucket_name, daemon_name, owner) + result['acl'] = self._get_acl(bucket_name, daemon_name, owner) + result['replication'] = self._get_replication(bucket_name, owner, daemon_name) + result['lifecycle'] = self._get_lifecycle(bucket_name, daemon_name, owner) # Append the locking configuration. - locking = self._get_locking(result['owner'], daemon_name, bucket_name) + locking = self._get_locking(owner, daemon_name, bucket_name) result.update(locking) return self._append_bid(result) @@ -589,7 +610,7 @@ class RgwBucket(RgwRESTController): raise DashboardException(e, http_status_code=500, component='rgw') @allow_empty_body - def set(self, bucket, bucket_id, uid, versioning_state=None, + def set(self, bucket, bucket_id, uid=None, versioning_state=None, encryption_state='false', encryption_type=None, key_id=None, mfa_delete=None, mfa_token_serial=None, mfa_token_pin=None, lock_mode=None, lock_retention_period_days=None, @@ -599,23 +620,27 @@ class RgwBucket(RgwRESTController): encryption_state = str_to_bool(encryption_state) if replication is not None: replication = str_to_bool(replication) - # When linking a non-tenant-user owned bucket to a tenanted user, we - # need to prefix bucket name with '/'. e.g. photos -> /photos - if '$' in uid and '/' not in bucket: - bucket = '/{}'.format(bucket) - - # Link bucket to new user: - result = self.proxy(daemon_name, - 'PUT', - 'bucket', { - 'bucket': bucket, - 'bucket-id': bucket_id, - 'uid': uid - }, - json_response=False) + + result = None + if uid: + # When linking a non-tenant-user owned bucket to a tenanted user, we + # need to prefix bucket name with '/'. e.g. 
photos -> /photos + if '$' in uid and '/' not in bucket: + bucket = '/{}'.format(bucket) + + # Link bucket to new user: + result = self.proxy(daemon_name, + 'PUT', + 'bucket', { + 'bucket': bucket, + 'bucket-id': bucket_id, + 'uid': uid + }, + json_response=False) uid_tenant = uid[:uid.find('$')] if uid.find('$') >= 0 else None bucket_name = RgwBucket.get_s3_bucket_name(bucket, uid_tenant) + uid = self._get_owner(uid) locking = self._get_locking(uid, daemon_name, bucket_name) if versioning_state: @@ -649,7 +674,7 @@ class RgwBucket(RgwRESTController): self._set_lifecycle(bucket_name, lifecycle, daemon_name, uid) else: self._delete_lifecycle(bucket_name, daemon_name, uid) - return self._append_bid(result) + return self._append_bid(result) if result else None def delete(self, bucket, purge_objects='true', daemon_name=None): return self.proxy(daemon_name, 'DELETE', 'bucket', { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.ts index c966da9b9c2..0ddb8e2f611 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.ts @@ -52,7 +52,7 @@ export class NvmeofGatewayComponent { prop: 'id' }, { - name: $localize`Host name`, + name: $localize`Hostname`, prop: 'hostname' }, { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts index 3a143a1a8df..32f7c76a362 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-form/nvmeof-initiators-form.component.ts @@ -10,7 +10,7 @@ import { AuthStorageService } from '~/app/shared/services/auth-storage.service'; import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; import { FinishedTask } from '~/app/shared/models/finished-task'; import { ActivatedRoute, Router } from '@angular/router'; -import { NvmeofService } from '~/app/shared/api/nvmeof.service'; +import { InitiatorRequest, NvmeofService } from '~/app/shared/api/nvmeof.service'; @Component({ selector: 'cd-nvmeof-initiators-form', @@ -26,6 +26,7 @@ export class NvmeofInitiatorsFormComponent implements OnInit { remove: boolean = false; subsystemNQN: string; removeHosts: { name: string; value: boolean; id: number }[] = []; + group: string; constructor( private authStorageService: AuthStorageService, @@ -52,6 +53,9 @@ export class NvmeofInitiatorsFormComponent implements OnInit { ); ngOnInit() { + this.route.queryParams.subscribe((params) => { + this.group = params?.['group']; + }); this.createForm(); this.action = this.actionLabels.ADD; this.route.params.subscribe((params: { subsystem_nqn: string }) => { @@ -108,8 +112,9 @@ export class NvmeofInitiatorsFormComponent implements OnInit { const hosts: string[] = this.addedHosts.value; let taskUrl = `nvmeof/initiator/${URLVerbs.ADD}`; - const request = { - host_nqn: hosts.join(',') + const request: InitiatorRequest = { + host_nqn: hosts.join(','), + gw_group: this.group }; if (allowAnyHost) { diff --git 
a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts index fff38e6985a..a5575a9c926 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.ts @@ -1,4 +1,4 @@ -import { Component, Input, OnChanges, OnInit, TemplateRef, ViewChild } from '@angular/core'; +import { Component, Input, OnInit, TemplateRef, ViewChild } from '@angular/core'; import { Router } from '@angular/router'; import { NvmeofService } from '~/app/shared/api/nvmeof.service'; import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component'; @@ -20,9 +20,11 @@ const BASE_URL = 'block/nvmeof/subsystems'; templateUrl: './nvmeof-initiators-list.component.html', styleUrls: ['./nvmeof-initiators-list.component.scss'] }) -export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { +export class NvmeofInitiatorsListComponent implements OnInit { @Input() subsystemNQN: string; + @Input() + group: string; @ViewChild('hostTpl', { static: true }) hostTpl: TemplateRef<any>; @@ -58,10 +60,10 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { permission: 'create', icon: Icons.add, click: () => - this.router.navigate([ - BASE_URL, - { outlets: { modal: [URLVerbs.ADD, this.subsystemNQN, 'initiator'] } } - ]), + this.router.navigate( + [BASE_URL, { outlets: { modal: [URLVerbs.ADD, this.subsystemNQN, 'initiator'] } }], + { queryParams: { group: this.group } } + ), canBePrimary: (selection: CdTableSelection) => !selection.hasSelection }, { @@ -79,17 +81,13 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { return this.selection.selected.findIndex((selected) => selected.nqn === '*'); } - ngOnChanges() { - this.listInitiators(); - } - updateSelection(selection: CdTableSelection) { this.selection = selection; } listInitiators() { this.nvmeofService - .getInitiators(this.subsystemNQN) + .getInitiators(this.subsystemNQN, this.group) .subscribe((initiators: NvmeofSubsystemInitiator[]) => { this.initiators = initiators; }); @@ -118,7 +116,10 @@ export class NvmeofInitiatorsListComponent implements OnInit, OnChanges { nqn: this.subsystemNQN, plural: itemNames.length > 1 }), - call: this.nvmeofService.removeInitiators(this.subsystemNQN, { host_nqn }) + call: this.nvmeofService.removeInitiators(this.subsystemNQN, { + host_nqn, + gw_group: this.group + }) }) }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts index cd362bf8abe..8310e65d203 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts @@ -103,7 +103,8 @@ export class NvmeofListenersFormComponent implements OnInit { const host = this.listenerForm.getValue('host'); let trsvcid = Number(this.listenerForm.getValue('trsvcid')); if (!trsvcid) trsvcid = 4420; - const request = { + const request: ListenerRequest = { + gw_group: this.group, 
host_name: host.hostname, traddr: host.addr, trsvcid @@ -128,9 +129,7 @@ export class NvmeofListenersFormComponent implements OnInit { component.listenerForm.setErrors({ cdSubmitButton: true }); }, complete: () => { - this.router.navigate([this.pageURL, { outlets: { modal: null } }], { - queryParams: { group: this.group } - }); + this.router.navigate([this.pageURL, { outlets: { modal: null } }]); } }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts index f88442e1bd6..b49adda7c1b 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.ts @@ -1,4 +1,4 @@ -import { Component, Input, OnChanges, OnInit } from '@angular/core'; +import { Component, Input, OnInit } from '@angular/core'; import { Router } from '@angular/router'; import { NvmeofService } from '~/app/shared/api/nvmeof.service'; import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component'; @@ -21,7 +21,7 @@ const BASE_URL = 'block/nvmeof/subsystems'; templateUrl: './nvmeof-listeners-list.component.html', styleUrls: ['./nvmeof-listeners-list.component.scss'] }) -export class NvmeofListenersListComponent implements OnInit, OnChanges { +export class NvmeofListenersListComponent implements OnInit { @Input() subsystemNQN: string; @Input() @@ -76,22 +76,18 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges { name: this.actionLabels.DELETE, permission: 'delete', icon: Icons.destroy, - click: () => this.deleteSubsystemModal() + click: () => this.deleteListenerModal() } ]; } - ngOnChanges() { - this.listListeners(); - } - updateSelection(selection: CdTableSelection) { this.selection = selection; } listListeners() { this.nvmeofService - .listListeners(this.subsystemNQN) + .listListeners(this.subsystemNQN, this.group) .subscribe((listResponse: NvmeofListener[]) => { this.listeners = listResponse.map((listener, index) => { listener['id'] = index; @@ -101,7 +97,7 @@ export class NvmeofListenersListComponent implements OnInit, OnChanges { }); } - deleteSubsystemModal() { + deleteListenerModal() { const listener = this.selection.first(); this.modalService.show(CriticalConfirmationModalComponent, { itemDescription: 'Listener', diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts index f5721e11ab6..b65ad62bdb4 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.ts @@ -41,6 +41,7 @@ export class NvmeofNamespacesFormComponent implements OnInit { nsid: string; currentBytes: number; invalidSizeError: boolean; + group: string; constructor( public actionLabels: ActionLabelsI18n, @@ -62,6 +63,9 @@ export class NvmeofNamespacesFormComponent implements OnInit { } init() { + this.route.queryParams.subscribe((params) => { + this.group = params?.['group']; + }); this.createForm(); this.action = this.actionLabels.CREATE; 
this.route.params.subscribe((params: { subsystem_nqn: string; nsid: string }) => { @@ -74,7 +78,7 @@ export class NvmeofNamespacesFormComponent implements OnInit { this.edit = true; this.action = this.actionLabels.EDIT; this.nvmeofService - .getNamespace(this.subsystemNQN, this.nsid) + .getNamespace(this.subsystemNQN, this.nsid, this.group) .subscribe((res: NvmeofSubsystemNamespace) => { const convertedSize = this.dimlessBinaryPipe.transform(res.rbd_image_size).split(' '); this.currentBytes = res.rbd_image_size; @@ -120,6 +124,7 @@ export class NvmeofNamespacesFormComponent implements OnInit { const image_size = this.nsForm.getValue('image_size'); const image_size_unit = this.nsForm.getValue('unit'); const request = {} as NamespaceCreateRequest | NamespaceEditRequest; + request['gw_group'] = this.group; if (image_size) { const key: string = this.edit ? 'rbd_image_size' : 'size'; const value: number = this.formatterService.toBytes(image_size + image_size_unit); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts index c40b538c820..8f8f6eb8d05 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-list/nvmeof-namespaces-list.component.ts @@ -1,4 +1,4 @@ -import { Component, Input, OnChanges, OnInit } from '@angular/core'; +import { Component, Input, OnInit } from '@angular/core'; import { Router } from '@angular/router'; import { NvmeofService } from '~/app/shared/api/nvmeof.service'; import { CriticalConfirmationModalComponent } from '~/app/shared/components/critical-confirmation-modal/critical-confirmation-modal.component'; @@ -23,9 +23,11 @@ const BASE_URL = 'block/nvmeof/subsystems'; templateUrl: './nvmeof-namespaces-list.component.html', styleUrls: ['./nvmeof-namespaces-list.component.scss'] }) -export class NvmeofNamespacesListComponent implements OnInit, OnChanges { +export class NvmeofNamespacesListComponent implements OnInit { @Input() subsystemNQN: string; + @Input() + group: string; namespacesColumns: any; tableActions: CdTableAction[]; @@ -117,10 +119,10 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges { permission: 'create', icon: Icons.add, click: () => - this.router.navigate([ - BASE_URL, - { outlets: { modal: [URLVerbs.CREATE, this.subsystemNQN, 'namespace'] } } - ]), + this.router.navigate( + [BASE_URL, { outlets: { modal: [URLVerbs.CREATE, this.subsystemNQN, 'namespace'] } }], + { queryParams: { group: this.group } } + ), canBePrimary: (selection: CdTableSelection) => !selection.hasSelection }, { @@ -128,41 +130,45 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges { permission: 'update', icon: Icons.edit, click: () => - this.router.navigate([ - BASE_URL, - { - outlets: { - modal: [URLVerbs.EDIT, this.subsystemNQN, 'namespace', this.selection.first().nsid] + this.router.navigate( + [ + BASE_URL, + { + outlets: { + modal: [ + URLVerbs.EDIT, + this.subsystemNQN, + 'namespace', + this.selection.first().nsid + ] + } } - } - ]) + ], + { queryParams: { group: this.group } } + ) }, { name: this.actionLabels.DELETE, permission: 'delete', icon: Icons.destroy, - click: () => this.deleteSubsystemModal() + click: () => this.deleteNamespaceModal() } ]; } - ngOnChanges() { - this.listNamespaces(); - } - 
updateSelection(selection: CdTableSelection) { this.selection = selection; } listNamespaces() { this.nvmeofService - .listNamespaces(this.subsystemNQN) + .listNamespaces(this.subsystemNQN, this.group) .subscribe((res: NvmeofSubsystemNamespace[]) => { this.namespaces = res; }); } - deleteSubsystemModal() { + deleteNamespaceModal() { const namespace = this.selection.first(); this.modalService.show(CriticalConfirmationModalComponent, { itemDescription: 'Namespace', @@ -174,7 +180,7 @@ export class NvmeofNamespacesListComponent implements OnInit, OnChanges { nqn: this.subsystemNQN, nsid: namespace.nsid }), - call: this.nvmeofService.deleteNamespace(this.subsystemNQN, namespace.nsid) + call: this.nvmeofService.deleteNamespace(this.subsystemNQN, namespace.nsid, this.group) }) }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html index 7f15a1360ad..58a1e01a525 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-details/nvmeof-subsystems-details.component.html @@ -24,14 +24,18 @@ <a ngbNavLink i18n>Namespaces</a> <ng-template ngbNavContent> - <cd-nvmeof-namespaces-list [subsystemNQN]="subsystemNQN"></cd-nvmeof-namespaces-list> + <cd-nvmeof-namespaces-list [subsystemNQN]="subsystemNQN" + [group]="group"> + </cd-nvmeof-namespaces-list> </ng-template> </ng-container> <ng-container ngbNavItem="initiators"> <a ngbNavLink i18n>Initiators</a> <ng-template ngbNavContent> - <cd-nvmeof-initiators-list [subsystemNQN]="subsystemNQN"></cd-nvmeof-initiators-list> + <cd-nvmeof-initiators-list [subsystemNQN]="subsystemNQN" + [group]="group"> + </cd-nvmeof-initiators-list> </ng-template> </ng-container> </nav> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts index f7b35a2d645..7e5b064f379 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts @@ -118,9 +118,7 @@ export class NvmeofSubsystemsFormComponent implements OnInit { component.subsystemForm.setErrors({ cdSubmitButton: true }); }, complete: () => { - this.router.navigate([this.pageURL, { outlets: { modal: null } }], { - queryParams: { group: this.group } - }); + this.router.navigate([this.pageURL, { outlets: { modal: null } }]); } }); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.ts index 61e28274048..269e427be50 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.ts @@ -79,25 +79,8 @@ export class NvmeofSubsystemsComponent extends ListWithDetails implements OnInit this.router.navigate([BASE_URL, { outlets: { modal: [URLVerbs.CREATE] } }], { queryParams: { group: this.group } }), - canBePrimary: 
(selection: CdTableSelection) => !selection.hasSelection - }, - { - name: this.actionLabels.EDIT, - permission: 'update', - icon: Icons.edit, - click: () => - this.router.navigate([ - BASE_URL, - { - outlets: { - modal: [ - URLVerbs.EDIT, - this.selection.first().nqn, - this.selection.first().max_namespaces - ] - } - } - ]) + canBePrimary: (selection: CdTableSelection) => !selection.hasSelection, + disable: () => !this.group }, { name: this.actionLabels.DELETE, @@ -114,12 +97,16 @@ export class NvmeofSubsystemsComponent extends ListWithDetails implements OnInit } getSubsystems() { - this.nvmeofService - .listSubsystems(this.group) - .subscribe((subsystems: NvmeofSubsystem[] | NvmeofSubsystem) => { - if (Array.isArray(subsystems)) this.subsystems = subsystems; - else this.subsystems = [subsystems]; - }); + if (this.group) { + this.nvmeofService + .listSubsystems(this.group) + .subscribe((subsystems: NvmeofSubsystem[] | NvmeofSubsystem) => { + if (Array.isArray(subsystems)) this.subsystems = subsystems; + else this.subsystems = [subsystems]; + }); + } else { + this.subsystems = []; + } } deleteSubsystemModal() { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-contants.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-contants.ts new file mode 100644 index 00000000000..c5b25191594 --- /dev/null +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-contants.ts @@ -0,0 +1,14 @@ +export const RBDActionHelpers = { + moveToTrash: $localize`Move an image to the trash. Images, even ones actively in-use by clones, can be moved to the trash and deleted at a later time.`, + delete: $localize`Delete an rbd image (including all data blocks). If the image has snapshots, this fails and nothing is deleted.`, + copy: $localize`Copy the content of a source image into the newly created destination image`, + flatten: $localize`If the image is a clone, copy all shared blocks from the parent snapshot and make the child independent of the parent, severing the link between parent snap and child. `, + enableMirroring: $localize`Mirroring needs to be enabled on the image to perform this action`, + clonedSnapshot: $localize`This RBD has cloned snapshots. Please delete related RBDs before deleting this RBD`, + secondayImageDelete: $localize`The image cannot be deleted as it is secondary`, + primaryImageResync: $localize`Primary RBD images cannot be resynced`, + invalidNameDisable: $localize`This RBD image has an invalid name and can't be managed by ceph.`, + removingStatus: $localize`Action not possible for an RBD in status 'Removing'`, + journalTooltipText: $localize`'Ensures reliable replication by logging changes before updating the image, but doubles write time, impacting performance. Not recommended for high-speed data processing tasks.`, + snapshotTooltipText: $localize`This mode replicates RBD images between clusters using snapshots, efficiently copying data changes but requiring complete delta syncing during failover. 
Ideal for less demanding tasks due to its less granular approach compared to journaling.` +}; diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html index 85c56cbf0d4..29a2008567e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html @@ -7,7 +7,12 @@ novalidate> <div i18n="form title" - class="form-header">{{ action | titlecase }} {{ resource | upperFirst }}</div> + class="form-header">{{ action | titlecase }} {{ resource | upperFirst }} + <cd-help-text> + <div *ngIf="action === 'Copy'">{{copyMessage}} + </div> + </cd-help-text> + </div> <!-- Parent --> <div class="form-item" @@ -103,7 +108,7 @@ <cd-alert-panel *ngIf="showMirrorDisableMessage" spacingClass="mt-2" [showTitle]="false" - type="info">Mirroring can not be disabled on <b>Pool</b> mirror mode. + type="info">Mirroring can not be disabled on <b> Pool </b> mirror mode. You need to change the mirror mode to enable this option. </cd-alert-panel> <cd-alert-panel *ngIf="currentPoolMirrorMode === 'disabled'" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.ts index d9c1c8925fc..7d694e2cab4 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.ts @@ -34,6 +34,7 @@ import { RbdFormEditRequestModel } from './rbd-form-edit-request.model'; import { RbdFormMode } from './rbd-form-mode.enum'; import { RbdFormResponseModel } from './rbd-form-response.model'; import { CdValidators } from '~/app/shared/forms/cd-validators'; +import { RBDActionHelpers } from '../rbd-contants'; class ExternalData { rbd: RbdFormResponseModel; @@ -69,34 +70,28 @@ export class RbdFormComponent extends CdForm implements OnInit { pool: string; peerConfigured = false; - advancedEnabled = false; - public rbdFormMode = RbdFormMode; mode: RbdFormMode; - response: RbdFormResponseModel; snapName: string; - defaultObjectSize = '4 MiB'; mirroringOptions = [ { value: 'journal', - text: - 'Ensures reliable replication by logging changes before updating the image, but doubles write time, impacting performance. Not recommended for high-speed data processing tasks.' + text: RBDActionHelpers.journalTooltipText }, { value: 'snapshot', - text: - 'This mode replicates RBD images between clusters using snapshots, efficiently copying data changes but requiring complete delta syncing during failover. Ideal for less demanding tasks due to its less granular approach compared to journaling.' 
+ text: RBDActionHelpers.snapshotTooltipText } ]; poolMirrorMode: string; mirroring = false; currentPoolName = ''; currentPoolMirrorMode = ''; - + copyMessage: string = RBDActionHelpers.copy; objectSizes: Array<string> = [ '4 KiB', '8 KiB', diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.spec.ts index d71027bde3d..c775333a407 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.spec.ts @@ -128,7 +128,7 @@ describe('RbdListComponent', () => { ] }); expect(component.getDeleteDisableDesc(component.selection)).toBe( - 'This RBD has cloned snapshots. Please delete related RBDs before deleting this RBD.' + 'This RBD has cloned snapshots. Please delete related RBDs before deleting this RBD' ); }); @@ -268,11 +268,11 @@ describe('RbdListComponent', () => { 'Copy', 'Flatten', 'Resync', - 'Delete', - 'Move to Trash', 'Remove Scheduling', 'Promote', - 'Demote' + 'Demote', + 'Move to Trash', + 'Delete' ], primary: { multiple: 'Create', @@ -300,7 +300,7 @@ describe('RbdListComponent', () => { } }, 'create,delete': { - actions: ['Create', 'Copy', 'Delete', 'Move to Trash'], + actions: ['Create', 'Copy', 'Move to Trash', 'Delete'], primary: { multiple: 'Create', executing: 'Create', @@ -322,11 +322,11 @@ describe('RbdListComponent', () => { 'Edit', 'Flatten', 'Resync', - 'Delete', - 'Move to Trash', 'Remove Scheduling', 'Promote', - 'Demote' + 'Demote', + 'Move to Trash', + 'Delete' ], primary: { multiple: '', @@ -345,7 +345,7 @@ describe('RbdListComponent', () => { } }, delete: { - actions: ['Delete', 'Move to Trash'], + actions: ['Move to Trash', 'Delete'], primary: { multiple: '', executing: '', diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.ts index 1a4bb4e0cf8..52d9ff819e2 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-list/rbd-list.component.ts @@ -23,7 +23,6 @@ import { DimlessBinaryPipe } from '~/app/shared/pipes/dimless-binary.pipe'; import { DimlessPipe } from '~/app/shared/pipes/dimless.pipe'; import { AuthStorageService } from '~/app/shared/services/auth-storage.service'; import { CdTableServerSideService } from '~/app/shared/services/cd-table-server-side.service'; -// import { ModalService } from '~/app/shared/services/modal.service'; import { TaskListService } from '~/app/shared/services/task-list.service'; import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; import { URLBuilderService } from '~/app/shared/services/url-builder.service'; @@ -32,7 +31,7 @@ import { RbdParentModel } from '../rbd-form/rbd-parent.model'; import { RbdTrashMoveModalComponent } from '../rbd-trash-move-modal/rbd-trash-move-modal.component'; import { RBDImageFormat, RbdModel } from './rbd-model'; import { ModalCdsService } from '~/app/shared/services/modal-cds.service'; - +import { RBDActionHelpers } from '../rbd-contants'; const BASE_URL = 'block/rbd'; @Component({ @@ -83,7 +82,6 @@ export class RbdListComponent extends ListWithDetails implements OnInit { count = 0; private tableContext: CdTableFetchDataContext = null; errorMessage: string; - builders = { 'rbd/create': (metadata: 
object) => this.createRbdFromTask(metadata['pool_name'], metadata['namespace'], metadata['image_name']), @@ -159,8 +157,20 @@ export class RbdListComponent extends ListWithDetails implements OnInit { icon: Icons.destroy, click: () => this.deleteRbdModal(), name: this.actionLabels.DELETE, + title: RBDActionHelpers.delete, disable: (selection: CdTableSelection) => this.getDeleteDisableDesc(selection) }; + const moveAction: CdTableAction = { + permission: 'delete', + icon: Icons.trash, + title: RBDActionHelpers.moveToTrash, + click: () => this.trashRbdModal(), + name: this.actionLabels.TRASH, + disable: (selection: CdTableSelection) => + this.getRemovingStatusDesc(selection) || + this.getInvalidNameDisable(selection) || + selection.first().image_format === RBDImageFormat.V1 + }; const resyncAction: CdTableAction = { permission: 'update', icon: Icons.refresh, @@ -177,7 +187,8 @@ export class RbdListComponent extends ListWithDetails implements OnInit { !!selection.first().cdExecuting, icon: Icons.copy, routerLink: () => `/block/rbd/copy/${getImageUri()}`, - name: this.actionLabels.COPY + name: this.actionLabels.COPY, + title: RBDActionHelpers.copy }; const flattenAction: CdTableAction = { permission: 'update', @@ -188,18 +199,10 @@ export class RbdListComponent extends ListWithDetails implements OnInit { !selection.first().parent, icon: Icons.flatten, click: () => this.flattenRbdModal(), - name: this.actionLabels.FLATTEN - }; - const moveAction: CdTableAction = { - permission: 'delete', - icon: Icons.trash, - click: () => this.trashRbdModal(), - name: this.actionLabels.TRASH, - disable: (selection: CdTableSelection) => - this.getRemovingStatusDesc(selection) || - this.getInvalidNameDisable(selection) || - selection.first().image_format === RBDImageFormat.V1 + name: this.actionLabels.FLATTEN, + title: RBDActionHelpers.flatten }; + const removeSchedulingAction: CdTableAction = { permission: 'update', icon: Icons.edit, @@ -217,9 +220,7 @@ export class RbdListComponent extends ListWithDetails implements OnInit { name: this.actionLabels.PROMOTE, visible: () => this.selection.first() != null && !this.selection.first().primary, disable: () => - this.selection.first().mirror_mode === 'Disabled' - ? 'Mirroring needs to be enabled on the image to perform this action' - : '' + this.selection.first().mirror_mode === 'Disabled' ? RBDActionHelpers.enableMirroring : '' }; const demoteAction: CdTableAction = { permission: 'update', @@ -228,9 +229,7 @@ export class RbdListComponent extends ListWithDetails implements OnInit { name: this.actionLabels.DEMOTE, visible: () => this.selection.first() != null && this.selection.first().primary, disable: () => - this.selection.first().mirror_mode === 'Disabled' - ? 'Mirroring needs to be enabled on the image to perform this action' - : '' + this.selection.first().mirror_mode === 'Disabled' ? RBDActionHelpers.enableMirroring : '' }; this.tableActions = [ addAction, @@ -238,11 +237,11 @@ export class RbdListComponent extends ListWithDetails implements OnInit { copyAction, flattenAction, resyncAction, - deleteAction, - moveAction, removeSchedulingAction, promoteAction, - demoteAction + demoteAction, + moveAction, + deleteAction ]; } @@ -624,17 +623,23 @@ export class RbdListComponent extends ListWithDetails implements OnInit { const first = selection.first(); if (first && this.hasClonedSnapshots(first)) { - return $localize`This RBD has cloned snapshots. 
Please delete related RBDs before deleting this RBD.`; + return RBDActionHelpers.clonedSnapshot; } - - return this.getInvalidNameDisable(selection) || this.hasClonedSnapshots(selection.first()); + if (first && first.primary === false) { + return RBDActionHelpers.secondayImageDelete; + } + return ( + this.getInvalidNameDisable(selection) || + this.hasClonedSnapshots(selection.first()) || + first.primary === false + ); } getResyncDisableDesc(selection: CdTableSelection): string | boolean { const first = selection.first(); if (first && this.imageIsPrimary(first)) { - return $localize`Primary RBD images cannot be resynced`; + return RBDActionHelpers.primaryImageResync; } return this.getInvalidNameDisable(selection); @@ -647,7 +652,7 @@ export class RbdListComponent extends ListWithDetails implements OnInit { const first = selection.first(); if (first?.name?.match(/[@/]/)) { - return $localize`This RBD image has an invalid name and can't be managed by ceph.`; + return RBDActionHelpers.invalidNameDisable; } return !selection.first() || !selection.hasSingleSelection; @@ -656,7 +661,7 @@ export class RbdListComponent extends ListWithDetails implements OnInit { getRemovingStatusDesc(selection: CdTableSelection): string | boolean { const first = selection.first(); if (first?.source === 'REMOVING') { - return $localize`Action not possible for an RBD in status 'Removing'`; + return RBDActionHelpers.removingStatus; } return false; } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-snapshotschedule-form/cephfs-snapshotschedule-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-snapshotschedule-form/cephfs-snapshotschedule-form.component.ts index da1a3f355c7..d79a4a6ccad 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-snapshotschedule-form/cephfs-snapshotschedule-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-snapshotschedule-form/cephfs-snapshotschedule-form.component.ts @@ -19,6 +19,7 @@ import { CephfsSnapshotScheduleService } from '~/app/shared/api/cephfs-snapshot- import { CephfsSubvolumeService } from '~/app/shared/api/cephfs-subvolume.service'; import { DirectoryStoreService } from '~/app/shared/api/directory-store.service'; import { ActionLabelsI18n, URLVerbs } from '~/app/shared/constants/app.constants'; +import { DEFAULT_SUBVOLUME_GROUP } from '~/app/shared/constants/cephfs.constant'; import { Icons } from '~/app/shared/enum/icons.enum'; import { RepeatFrequency } from '~/app/shared/enum/repeat-frequency.enum'; import { RetentionFrequency } from '~/app/shared/enum/retention-frequency.enum'; @@ -35,7 +36,6 @@ import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; const VALIDATON_TIMER = 300; const DEBOUNCE_TIMER = 300; -const DEFAULT_SUBVOLUME_GROUP = '_nogroup'; @Component({ selector: 'cd-cephfs-snapshotschedule-form', diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-list/cephfs-subvolume-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-list/cephfs-subvolume-list.component.ts index b5eb7a88681..be7b81940df 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-list/cephfs-subvolume-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-list/cephfs-subvolume-list.component.ts @@ -34,8 +34,7 @@ import { CephfsMountDetailsComponent } from '../cephfs-mount-details/cephfs-moun import { HealthService } from 
'~/app/shared/api/health.service'; import _ from 'lodash'; import { ModalCdsService } from '~/app/shared/services/modal-cds.service'; - -const DEFAULT_SUBVOLUME_GROUP = '_nogroup'; +import { DEFAULT_SUBVOLUME_GROUP } from '~/app/shared/constants/cephfs.constant'; @Component({ selector: 'cd-cephfs-subvolume-list', diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-snapshots-list/cephfs-subvolume-snapshots-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-snapshots-list/cephfs-subvolume-snapshots-list.component.ts index 92f337f85a6..0087ffd66cd 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-snapshots-list/cephfs-subvolume-snapshots-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-subvolume-snapshots-list/cephfs-subvolume-snapshots-list.component.ts @@ -26,7 +26,7 @@ import moment from 'moment'; import { Validators } from '@angular/forms'; import { CdValidators } from '~/app/shared/forms/cd-validators'; import { ModalCdsService } from '~/app/shared/services/modal-cds.service'; -import { DEFAULT_SUBVOLUME_GROUP } from '~/app/shared/constants/cephfs'; +import { DEFAULT_SUBVOLUME_GROUP } from '~/app/shared/constants/cephfs.constant'; @Component({ selector: 'cd-cephfs-subvolume-snapshots-list', diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/cluster.module.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/cluster.module.ts index dc61dc5ab67..b6ae76a66be 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/cluster.module.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/cluster.module.ts @@ -9,7 +9,9 @@ import { CheckboxModule, ButtonModule, GridModule, - ProgressIndicatorModule + ProgressIndicatorModule, + InputModule, + ModalModule } from 'carbon-components-angular'; import { TreeModule } from '@circlon/angular-tree-component'; @@ -102,7 +104,9 @@ import { MultiClusterDetailsComponent } from './multi-cluster/multi-cluster-deta CheckboxModule, GridModule, ProgressIndicatorModule, - ButtonModule + ButtonModule, + InputModule, + ModalModule ], declarations: [ HostsComponent, diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.html index af09b9a4fef..9b751d69c5a 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.html @@ -1,108 +1,104 @@ -<cd-modal [pageURL]="pageURL" - [modalRef]="activeModal"> - <span class="modal-title" - i18n>{{ action | titlecase }} {{ resource | upperFirst }}</span> - - <ng-container class="modal-content"> - - <div *cdFormLoading="loading"> - <form name="hostForm" - #formDir="ngForm" - [formGroup]="hostForm" - novalidate> - - <div class="modal-body"> - - <!-- Hostname --> - <div class="form-group row"> - <label class="cd-col-form-label required" - for="hostname"> - <ng-container i18n>Hostname</ng-container> - <cd-helper> - <p i18n>To add multiple hosts at once, you can enter:</p> - <ul> - <li i18n>a comma-separated list of hostnames <samp>(e.g.: example-01,example-02,example-03)</samp>,</li> - <li i18n>a range expression <samp>(e.g.: example-[01-03].ceph)</samp>,</li> - <li i18n>a comma separated range expression <samp>(e.g.: 
example-[01-05].lab.com,example2-[1-4].lab.com,example3-[001-006].lab.com)</samp></li> - </ul> - </cd-helper> - </label> - <div class="cd-col-form-input"> - <input class="form-control" - type="text" - placeholder="mon-123" - id="hostname" - name="hostname" - formControlName="hostname" - autofocus - (keyup)="checkHostNameValue()"> - <span class="invalid-feedback" - *ngIf="hostForm.showError('hostname', formDir, 'required')" - i18n>This field is required.</span> - <span class="invalid-feedback" - *ngIf="hostForm.showError('hostname', formDir, 'uniqueName')" - i18n>The chosen hostname is already in use.</span> - </div> - </div> - - <!-- Address --> - <div class="form-group row" - *ngIf="!hostPattern"> - <label class="cd-col-form-label" - for="addr" - i18n>Network address</label> - <div class="cd-col-form-input"> - <input class="form-control" - type="text" - placeholder="192.168.0.1" - id="addr" - name="addr" - formControlName="addr"> - <span class="invalid-feedback" - *ngIf="hostForm.showError('addr', formDir, 'pattern')" - i18n>The value is not a valid IP address.</span> - </div> - </div> - - <!-- Labels --> - <div class="form-group row"> - <label i18n - for="labels" - class="cd-col-form-label">Labels</label> - <div class="cd-col-form-input"> - <cd-select-badges id="labels" - [data]="hostForm.controls.labels.value" - [options]="labelsOption" - [customBadges]="true" - [messages]="messages"> - </cd-select-badges> - </div> - </div> - - <!-- Maintenance Mode --> - <div class="form-group row" - *ngIf="!hideMaintenance"> - <div class="cd-col-form-offset"> - <div class="custom-control custom-checkbox"> - <input class="custom-control-input" - id="maintenance" - type="checkbox" - formControlName="maintenance"> - <label class="custom-control-label" - for="maintenance" - i18n>Maintenance Mode</label> - </div> - </div> - </div> - </div> - - <div class="modal-footer"> - <cd-form-button-panel (submitActionEvent)="submit()" - [form]="hostForm" - [submitText]="(action | titlecase) + ' ' + (resource | upperFirst)" - wrappingClass="text-right"></cd-form-button-panel> - </div> - </form> +<cds-modal size="md" + [open]="open" + [hasScrollingContent]="true" + (overlaySelected)="closeModal()"> + <cds-modal-header (closeSelect)="closeModal()"> + <h3 cdsModalHeaderHeading + i18n>{{ action | titlecase }} {{ resource | upperFirst }}</h3> + </cds-modal-header> + <ng-container *cdFormLoading="loading"> + <form name="hostForm" + #formDir="ngForm" + [formGroup]="hostForm" + novalidate> + <div cdsModalContent> + <!-- Hostname --> + <div class="form-item"> + <cds-text-label label="Hostname" + for="hostname" + cdRequiredField="Hostname" + [invalid]="!hostForm.controls.hostname.valid && hostForm.controls.hostname.dirty" + [invalidText]="hostnameError" + i18n>Hostname + <input cdsText + type="text" + placeholder="mon-123" + id="hostname" + name="hostname" + formControlName="hostname" + autofocus + (keyup)="checkHostNameValue()"> + </cds-text-label> + <ng-template #hostnameError> + <span *ngIf="hostForm.showError('hostname', formDir, 'required')" + class="invalid-feedback"> + <ng-container i18n> This field is required. </ng-container> + </span> + <span *ngIf="hostForm.showError('hostname', formDir, 'uniqueName')" + class="invalid-feedback"> + <ng-container i18n> The chosen hostname is already in use. 
</ng-container> + </span> + </ng-template> + <cd-help-text> + To add multiple hosts at once, you can enter: + <ul> + <li>a comma-separated list of hostnames <samp>(e.g.: example-01,example-02,example-03)</samp>,</li> + <li>a range expression <samp>(e.g.: example-[01-03].ceph)</samp>,</li> + <li>a comma separated range expression <samp>(e.g.: example-[01-05].lab.com,example2-[1-4].lab.com,example3-[001-006].lab.com)</samp></li> + </ul> + </cd-help-text> + </div> + <!-- Address --> + <div class="form-item" + *ngIf="!hostPattern"> + <cds-text-label label="Network address" + for="addr" + i18n>Network address + <input cdsText + type="text" + placeholder="192.168.0.1" + id="addr" + name="addr" + formControlName="addr"/> + </cds-text-label> + <ng-template #hostaddrError> + <span *ngIf="hostForm.showError('addr', formDir, 'pattern')"> + <ng-container i18n> The value is not a valid IP address. </ng-container> + </span> + </ng-template> + </div> + <!-- Labels --> + <div class="form-item"> + <cds-combo-box label="Labels" + type="multi" + selectionFeedback="top-after-reopen" + for="labels" + name="labels" + formControlName="labels" + placeholder="Select Labels..." + i18n-placeholder + [appendInline]="true" + [items]="labelsOption" + itemValueKey="value" + id="labels" + i18n> + <cds-dropdown-list></cds-dropdown-list> + </cds-combo-box> + </div> + <!-- Maintenance Mode --> + <div *ngIf="!hideMaintenance"> + <cds-checkbox id="maintenance" + type="checkbox" + formControlName="maintenance" + i18n>Maintenance Mode + </cds-checkbox> + </div> </div> - </ng-container> -</cd-modal> + <cd-form-button-panel (submitActionEvent)="submit()" + [form]="hostForm" + [submitText]="(action | titlecase) + ' ' + (resource | upperFirst)" + [modalForm]="true"> + </cd-form-button-panel> + </form> +</ng-container> +</cds-modal> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.spec.ts index ed85d96cb1b..8097bb26018 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.spec.ts @@ -10,6 +10,7 @@ import { LoadingPanelComponent } from '~/app/shared/components/loading-panel/loa import { SharedModule } from '~/app/shared/shared.module'; import { configureTestBed, FormHelper } from '~/testing/unit-test-helper'; import { HostFormComponent } from './host-form.component'; +import { InputModule, ModalModule } from 'carbon-components-angular'; describe('HostFormComponent', () => { let component: HostFormComponent; @@ -23,7 +24,9 @@ describe('HostFormComponent', () => { HttpClientTestingModule, RouterTestingModule, ReactiveFormsModule, - ToastrModule.forRoot() + ToastrModule.forRoot(), + InputModule, + ModalModule ], declarations: [HostFormComponent], providers: [NgbActiveModal] @@ -45,7 +48,7 @@ describe('HostFormComponent', () => { it('should open the form in a modal', () => { const nativeEl = fixture.debugElement.nativeElement; - expect(nativeEl.querySelector('cd-modal')).not.toBe(null); + expect(nativeEl.querySelector('cds-modal')).not.toBe(null); }); it('should validate the network address is valid', fakeAsync(() => { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.ts index 
240a0a7bebb..166ab013e73 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/host-form/host-form.component.ts @@ -1,8 +1,6 @@ import { Component, OnInit } from '@angular/core'; import { UntypedFormControl, Validators } from '@angular/forms'; -import { Router } from '@angular/router'; - -import { NgbActiveModal } from '@ng-bootstrap/ng-bootstrap'; +import { ActivatedRoute, Router } from '@angular/router'; import expand from 'brace-expansion'; import { HostService } from '~/app/shared/api/host.service'; @@ -15,6 +13,7 @@ import { CdValidators } from '~/app/shared/forms/cd-validators'; import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context'; import { FinishedTask } from '~/app/shared/models/finished-task'; import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; +import { Location } from '@angular/common'; @Component({ selector: 'cd-host-form', @@ -22,6 +21,7 @@ import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; styleUrls: ['./host-form.component.scss'] }) export class HostFormComponent extends CdForm implements OnInit { + open: boolean = false; hostForm: CdFormGroup; action: string; resource: string; @@ -46,7 +46,8 @@ export class HostFormComponent extends CdForm implements OnInit { private actionLabels: ActionLabelsI18n, private hostService: HostService, private taskWrapper: TaskWrapperService, - public activeModal: NgbActiveModal + private route: ActivatedRoute, + private location: Location ) { super(); this.resource = $localize`host`; @@ -54,9 +55,7 @@ export class HostFormComponent extends CdForm implements OnInit { } ngOnInit() { - if (this.router.url.includes('hosts')) { - this.pageURL = 'hosts'; - } + this.open = this.route.outlet === 'modal'; this.createForm(); const hostContext = new CdTableFetchDataContext(() => undefined); this.hostService.list(hostContext.toParams(), 'false').subscribe((resp: any[]) => { @@ -69,7 +68,7 @@ export class HostFormComponent extends CdForm implements OnInit { this.hostService.getLabels().subscribe((resp: string[]) => { const uniqueLabels = new Set(resp.concat(this.hostService.predefinedLabels)); this.labelsOption = Array.from(uniqueLabels).map((label) => { - return { enabled: true, name: label, selected: false, description: null }; + return { enabled: true, name: label, content: label, selected: false, description: null }; }); }); } @@ -94,7 +93,7 @@ export class HostFormComponent extends CdForm implements OnInit { validators: [CdValidators.ip()] }), labels: new UntypedFormControl([]), - maintenance: new UntypedFormControl(false) + maintenance: new UntypedFormControl() }); } @@ -166,9 +165,13 @@ export class HostFormComponent extends CdForm implements OnInit { complete: () => { this.pageURL === 'hosts' ? 
this.router.navigate([this.pageURL, { outlets: { modal: null } }]) - : this.activeModal.close(); + : this.location.back(); } }); }); } + + closeModal(): void { + this.location.back(); + } } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.ts index adb89e6cd5c..c26d24177fd 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.ts @@ -29,7 +29,6 @@ import { Permissions } from '~/app/shared/models/permissions'; import { EmptyPipe } from '~/app/shared/pipes/empty.pipe'; import { AuthStorageService } from '~/app/shared/services/auth-storage.service'; import { CdTableServerSideService } from '~/app/shared/services/cd-table-server-side.service'; -import { ModalService } from '~/app/shared/services/modal.service'; import { NotificationService } from '~/app/shared/services/notification.service'; import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service'; import { URLBuilderService } from '~/app/shared/services/url-builder.service'; @@ -125,7 +124,6 @@ export class HostsComponent extends ListWithDetails implements OnDestroy, OnInit private emptyPipe: EmptyPipe, private hostService: HostService, private actionLabels: ActionLabelsI18n, - private modalService: ModalService, private taskWrapper: TaskWrapperService, private router: Router, private notificationService: NotificationService, @@ -153,7 +151,7 @@ export class HostsComponent extends ListWithDetails implements OnDestroy, OnInit click: () => this.router.url.includes('/hosts') ? this.router.navigate([BASE_URL, { outlets: { modal: [URLVerbs.ADD] } }]) - : (this.bsModalRef = this.modalService.show(HostFormComponent, { + : (this.bsModalRef = this.cdsModalService.show(HostFormComponent, { hideMaintenance: this.hideMaintenance })), disable: (selection: CdTableSelection) => this.getDisable('add', selection) diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.html index a9961f72ff6..b05d07fb31b 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.html @@ -34,6 +34,7 @@ [columns]="columns" columnMode="flex" selectionType="single" + (fetchData)="refresh()" [hasDetails]="true" (setExpandedRow)="setExpandedRow($event)" [maxLimit]="25" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts index 9f05ab668ab..78b4c9c1859 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/multi-cluster/multi-cluster-list/multi-cluster-list.component.ts @@ -273,4 +273,9 @@ export class MultiClusterListComponent extends ListWithDetails implements OnInit super.setExpandedRow(expandedRow); this.router.navigate(['performance-details'], { relativeTo: this.route }); } + + refresh() { + 
this.multiClusterService.refresh(); + this.multiClusterService.refreshTokenStatus(); + } } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html index 5f5f91dd0ed..a56877512f9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.html @@ -6,13 +6,15 @@ i18n>OSDs List</a> <ng-template ngbNavContent> <cd-table [data]="osds" - (fetchData)="getOsdList()" + (fetchData)="getOsdList($event)" [columns]="columns" selectionType="multiClick" [hasDetails]="true" (setExpandedRow)="setExpandedRow($event)" (updateSelection)="updateSelection($event)" - [updateSelectionOnRefresh]="'never'"> + [updateSelectionOnRefresh]="'never'" + [serverSide]="true" + [count]="count"> <div class="table-actions"> <cd-table-actions [permission]="permissions.osd" diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts index 77facfe3f85..85ea9240414 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.spec.ts @@ -33,6 +33,8 @@ import { import { OsdReweightModalComponent } from '../osd-reweight-modal/osd-reweight-modal.component'; import { OsdListComponent } from './osd-list.component'; import { ResizeObserver as ResizeObserverPolyfill } from '@juggle/resize-observer'; +import { PaginateObservable } from '~/app/shared/api/paginate.model'; +import { Osd } from '~/app/shared/models/osd.model'; describe('OsdListComponent', () => { let component: OsdListComponent; @@ -141,38 +143,42 @@ describe('OsdListComponent', () => { }); describe('getOsdList', () => { - let osds: any[]; + let osds: Osd[]; let flagsSpy: jasmine.Spy; - const createOsd = (n: number) => - <Record<string, any>>{ - in: 'in', - up: 'up', - tree: { - device_class: 'ssd' - }, - stats_history: { - op_out_bytes: [ - [n, n], - [n * 2, n * 2] - ], - op_in_bytes: [ - [n * 3, n * 3], - [n * 4, n * 4] - ] - }, - stats: { - stat_bytes_used: n * n, - stat_bytes: n * n * n - }, - state: [] - }; + const createOsd = (n: number): Osd => ({ + id: n, + host: { + id: 0, + name: 'test_host' + }, + in: 1, + up: 1, + tree: { + device_class: 'ssd' + }, + stats_history: { + op_out_bytes: [ + [n, n], + [n * 2, n * 2] + ], + op_in_bytes: [ + [n * 3, n * 3], + [n * 4, n * 4] + ] + }, + stats: { + stat_bytes_used: n * n, + stat_bytes: n * n * n + }, + state: [] + }); const expectAttributeOnEveryOsd = (attr: string) => expect(component.osds.every((osd) => Boolean(_.get(osd, attr)))).toBeTruthy(); beforeEach(() => { - spyOn(osdService, 'getList').and.callFake(() => of(osds)); + spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable<Osd[]>(of(osds))); flagsSpy = spyOn(osdService, 'getFlags').and.callFake(() => of([])); osds = [createOsd(1), createOsd(2), createOsd(3)]; component.getOsdList(); @@ -556,8 +562,9 @@ describe('OsdListComponent', () => { beforeEach(() => { component.permissions = fakeAuthStorageService.getPermissions(); - spyOn(osdService, 'getList').and.callFake(() => of(fakeOsds)); + spyOn(osdService, 'getList').and.callFake(() => new PaginateObservable<Osd[]>(of(fakeOsds))); 
spyOn(osdService, 'getFlags').and.callFake(() => of([])); + component.getOsdList(); }); const testTableActions = async ( diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts index 103b61e79f0..91cb0193f3c 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/osd/osd-list/osd-list.component.ts @@ -39,6 +39,8 @@ import { OsdRecvSpeedModalComponent } from '../osd-recv-speed-modal/osd-recv-spe import { OsdReweightModalComponent } from '../osd-reweight-modal/osd-reweight-modal.component'; import { OsdScrubModalComponent } from '../osd-scrub-modal/osd-scrub-modal.component'; import { ModalCdsService } from '~/app/shared/services/modal-cds.service'; +import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context'; +import { Osd } from '~/app/shared/models/osd.model'; const BASE_URL = 'osd'; @@ -71,6 +73,7 @@ export class OsdListComponent extends ListWithDetails implements OnInit { clusterWideActions: CdTableAction[]; icons = Icons; osdSettings = new OsdSettings(); + count = 0; selection = new CdTableSelection(); osds: any[] = []; @@ -426,10 +429,13 @@ export class OsdListComponent extends ListWithDetails implements OnInit { } } - getOsdList() { - const observables = [this.osdService.getList(), this.osdService.getFlags()]; - observableForkJoin(observables).subscribe((resp: [any[], string[]]) => { - this.osds = resp[0].map((osd) => { + getOsdList(context?: CdTableFetchDataContext) { + if (!context) context = new CdTableFetchDataContext(); + const pagination_obs = this.osdService.getList(context.toParams()); + const observables = [pagination_obs.observable, this.osdService.getFlags()]; + observableForkJoin(observables).subscribe((resp: any) => { + this.osds = resp[0].map((osd: Osd) => { + this.count = pagination_obs.count; osd.collectedStates = OsdListComponent.collectStates(osd); osd.stats_history.out_bytes = osd.stats_history.op_out_bytes.map((i: string) => i[1]); osd.stats_history.in_bytes = osd.stats_history.op_in_bytes.map((i: string) => i[1]); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts index d3ea8c018f6..367418c752e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-daemon-list/service-daemon-list.component.spec.ts @@ -27,7 +27,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'osd0', container_id: '003c10beafc8c27b635bcdfed1ed832e4c1005be89bb1bb05ad4cc6c2b98e41b', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: '3', daemon_type: 'osd', daemon_name: 'osd.3', @@ -47,7 +47,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'osd0', container_id: 'baeec41a01374b3ed41016d542d19aef4a70d69c27274f271e26381a0cc58e7a', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 
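/* annotation: these fixtures repoint the daemon-base test image from docker.io to quay.io */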
'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: '4', daemon_type: 'osd', daemon_name: 'osd.4', @@ -63,7 +63,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'osd0', container_id: '8483de277e365bea4365cee9e1f26606be85c471e4da5d51f57e4b85a42c616e', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: '5', daemon_type: 'osd', daemon_name: 'osd.5', @@ -79,7 +79,7 @@ describe('ServiceDaemonListComponent', () => { hostname: 'mon0', container_id: '6ca0574f47e300a6979eaf4e7c283a8c4325c2235ae60358482fc4cd58844a21', container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', daemon_id: 'a', daemon_name: 'mon.a', daemon_type: 'mon', @@ -99,7 +99,7 @@ describe('ServiceDaemonListComponent', () => { service_name: 'osd', status: { container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', size: 3, running: 3, last_refresh: '2020-02-25T04:33:26.465699' @@ -111,7 +111,7 @@ describe('ServiceDaemonListComponent', () => { service_name: 'crash', status: { container_image_id: 'e70344c77bcbf3ee389b9bf5128f635cf95f3d59e005c5d8e67fc19bcc74ed23', - container_image_name: 'docker.io/ceph/daemon-base:latest-master-devel', + container_image_name: 'quay.io/ceph/daemon-base:latest-master-devel', size: 1, running: 1, last_refresh: '2020-02-25T04:33:26.465766' diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html index 1a73490175d..0da4913e9b8 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.html @@ -106,7 +106,6 @@ [invalid]="nfsForm.controls.fsal.controls.user_id.invalid && (nfsForm.controls.fsal.controls.user_id.dirty)" [invalidText]="userIdError" [skeleton]="allRGWUsers === null" - (valueChange)="pathChangeHandler()" i18n> <option *ngIf="allRGWUsers === null" value="">Loading...</option> @@ -223,8 +222,6 @@ name="path" formControlName="path" [ngbTypeahead]="pathDataSource" - (selectItem)="pathChangeHandler()" - (blur)="pathChangeHandler()" [invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)"> </cds-text-label> <ng-template #pathError> @@ -259,8 +256,6 @@ name="path" formControlName="path" [ngbTypeahead]="bucketDataSource" - (selectItem)="pathChangeHandler()" - (blur)="pathChangeHandler()" [invalid]="nfsForm.controls.path.invalid && (nfsForm.controls.path.dirty)"> </cds-text-label> <ng-template #bucketPathError> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts index 2317671b022..d502524256e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/nfs/nfs-form/nfs-form.component.ts @@ -434,7 +434,7 
@@ export class NfsFormComponent extends CdForm implements OnInit { fs_name: this.selectedFsName } }); - this.volumeChangeHandler(); + this.getSubVolGrp(this.selectedFsName); } if (!_.isEmpty(this.selectedSubvolGroup)) { this.nfsForm.patchValue({ diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html index ddc202152b9..463eac88b1e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-details/rgw-bucket-details.component.html @@ -158,8 +158,14 @@ </div> </td> <td> - <pre *ngIf="lifecycleFormat === 'json'">{{selection.lifecycle | json}}</pre> - <pre *ngIf="lifecycleFormat === 'xml'">{{ (selection.lifecycle | xml) || '-'}}</pre> + <cds-code-snippet display="multi" + *ngIf="lifecycleFormat === 'json'"> + {{selection.lifecycle | json}} + </cds-code-snippet> + <cds-code-snippet display="multi" + *ngIf="lifecycleFormat === 'xml'"> + {{ (selection.lifecycle | xml:{'Rules':'Rule'}) || '-'}} + </cds-code-snippet> </td> </tr> <tr> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.html index f77526be779..9c07182a0e5 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.html @@ -91,6 +91,14 @@ <span class="invalid-feedback" *ngIf="bucketForm.showError('owner', frm, 'required')" i18n>This field is required.</span> + <cd-alert-panel + type="info" + *ngIf="bucketForm.get('owner').disabled" + spacingClass="me-1 mt-1" + i18n> + The bucket is owned by an account. UI does not support changing + the ownership of bucket owned by an account. + </cd-alert-panel> </div> </div> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.ts index d82c71e3cf7..53a1ac442c5 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-bucket-form/rgw-bucket-form.component.ts @@ -269,6 +269,14 @@ export class RgwBucketFormComponent extends CdForm implements OnInit, AfterViewC } this.bucketForm.setValue(value); if (this.editing) { + // temporary fix until the s3 account management is implemented in + // the frontend. Disable changing the owner of the bucket in case + // its owned by the account. + // @TODO: Introduce account selection for a bucket. + if (!this.owners.includes(value['owner'])) { + this.owners.push(value['owner']); + this.bucketForm.get('owner').disable(); + } this.isVersioningAlreadyEnabled = this.isVersioningEnabled; this.isMfaDeleteAlreadyEnabled = this.isMfaDeleteEnabled; this.setMfaDeleteValidators(); @@ -327,11 +335,15 @@ export class RgwBucketFormComponent extends CdForm implements OnInit, AfterViewC // Edit const versioning = this.getVersioningStatus(); const mfaDelete = this.getMfaDeleteStatus(); + // make the owner empty if the field is disabled. 
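// (annotation: the owner control is disabled in the hunk above when the current
// owner is not in the owners list, i.e. the bucket is account-owned)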
+ // this ensures the bucket doesn't gets updated with owner when + // the bucket is owned by the account. + const owner = this.bucketForm.get('owner').disabled === true ? '' : values['owner']; this.rgwBucketService .update( values['bid'], values['id'], - values['owner'], + owner, versioning, values['encryption_enabled'], values['encryption_type'], diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts index fbe3110b978..67c98b0a59f 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-details/rgw-multisite-details.component.ts @@ -40,6 +40,7 @@ import { Router } from '@angular/router'; import { RgwMultisiteWizardComponent } from '../rgw-multisite-wizard/rgw-multisite-wizard.component'; import { RgwMultisiteSyncPolicyComponent } from '../rgw-multisite-sync-policy/rgw-multisite-sync-policy.component'; import { ModalCdsService } from '~/app/shared/services/modal-cds.service'; +import { RgwMultisiteService } from '~/app/shared/api/rgw-multisite.service'; const BASE_URL = 'rgw/multisite/configuration'; @@ -121,7 +122,8 @@ export class RgwMultisiteDetailsComponent implements OnDestroy, OnInit { public rgwDaemonService: RgwDaemonService, public mgrModuleService: MgrModuleService, private notificationService: NotificationService, - private cdsModalService: ModalCdsService + private cdsModalService: ModalCdsService, + private rgwMultisiteService: RgwMultisiteService ) { this.permission = this.authStorageService.getPermissions().rgw; } @@ -137,9 +139,7 @@ export class RgwMultisiteDetailsComponent implements OnDestroy, OnInit { multisiteInfo: this.multisiteInfo }; if (entityName === 'realm') { - this.bsModalRef = this.modalService.show(RgwMultisiteRealmFormComponent, initialState, { - size: 'lg' - }); + this.bsModalRef = this.cdsModalService.show(RgwMultisiteRealmFormComponent, initialState); } else if (entityName === 'zonegroup') { this.bsModalRef = this.modalService.show(RgwMultisiteZonegroupFormComponent, initialState, { size: 'lg' @@ -412,22 +412,30 @@ export class RgwMultisiteDetailsComponent implements OnDestroy, OnInit { this.realmIds = []; this.zoneIds = []; this.evaluateMigrateAndReplicationActions(); + this.rgwMultisiteService.restartGatewayMessage$.subscribe((value) => { + if (value !== null) { + this.restartGatewayMessage = value; + } else { + this.checkRestartGatewayMessage(); + } + }); + return allNodes; + } + + checkRestartGatewayMessage() { this.rgwDaemonService.list().subscribe((data: any) => { - const hasEmptyRealmName = data.some( - (item: { [x: string]: any }) => - item['realm_name'] === '' && - !data.some((i: { [x: string]: any }) => i['id'] === item['id'] && i['realm_name'] !== '') - ); + const realmName = data.map((item: { [x: string]: any }) => item['realm_name']); if ( this.defaultRealmId !== '' && this.defaultZonegroupId !== '' && this.defaultZoneId !== '' && - hasEmptyRealmName + realmName.includes('') ) { this.restartGatewayMessage = true; + } else { + this.restartGatewayMessage = false; } }); - return allNodes; } getDefaultsEntities( @@ -546,20 +554,20 @@ export class RgwMultisiteDetailsComponent implements OnDestroy, OnInit { delete(node: TreeNode) { if (node.data.type === 'realm') { - this.modalRef = this.modalService.show(CriticalConfirmationModalComponent, { 
+ const modalRef = this.cdsModalService.show(CriticalConfirmationModalComponent, { itemDescription: $localize`${node.data.type} ${node.data.name}`, itemNames: [`${node.data.name}`], submitAction: () => { this.rgwRealmService.delete(node.data.name).subscribe( () => { - this.modalRef.close(); this.notificationService.show( NotificationType.success, $localize`Realm: '${node.data.name}' deleted successfully` ); + this.cdsModalService.dismissAll(); }, () => { - this.modalRef.componentInstance.stopLoadingSpinner(); + this.cdsModalService.stopLoadingSpinner(modalRef.deletionForm); } ); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.html index 1fa5b08f60d..5ca36f4bd2f 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.html @@ -1,58 +1,91 @@ -<cd-modal [modalRef]="activeModal"> - <ng-container i18n="form title" - class="modal-title">{{ action | titlecase }} {{ resource | upperFirst }}</ng-container> +<cds-modal size="sm" + [open]="open" + [hasScrollingContent]="false" + (overlaySelected)="closeModal()"> + <cds-modal-header (closeSelect)="closeModal()"> + <h3 cdsModalHeaderHeading + i18n>{{ action | titlecase }} {{ resource | upperFirst }}</h3> + </cds-modal-header> - <ng-container class="modal-content"> + <div cdsModalContent> <form name="multisiteRealmForm" #formDir="ngForm" [formGroup]="multisiteRealmForm" novalidate> - <div class="modal-body"> - <div class="form-group row"> - <label class="cd-col-form-label required" - for="realmName" - i18n>Realm Name</label> - <div class="cd-col-form-input"> - <input class="form-control" - type="text" - placeholder="Realm name..." - id="realmName" - name="realmName" - formControlName="realmName"> - <span class="invalid-feedback" - *ngIf="multisiteRealmForm.showError('realmName', formDir, 'required')" - i18n>This field is required.</span> - <span class="invalid-feedback" - *ngIf="multisiteRealmForm.showError('realmName', formDir, 'uniqueName')" - i18n>The chosen realm name is already in use.</span> - <div class="custom-control custom-checkbox"> - <input class="form-check-input" - id="default_realm" - name="default_realm" - formControlName="default_realm" - [attr.disabled]="action === 'edit' ? true: null" - type="checkbox"> - <label class="form-check-label" - for="default_realm" - i18n>Default</label> - <cd-helper *ngIf="action === 'edit' && info.data.is_default"> - <span i18n>You cannot unset the default flag.</span> - </cd-helper> - <cd-helper *ngIf="action === 'edit' && !info.data.is_default"> - <span i18n>Please consult the <a href="{{ docUrl }}">documentation</a> to follow the failover mechanism</span> - </cd-helper> - <cd-helper *ngIf="defaultRealmDisabled && action === 'create'"> - <span i18n>Default realm already exists.</span> - </cd-helper> - </div> - </div> + <div class="form-item"> + <cds-text-label + labelInputID="realmName" + [invalid]=" + multisiteRealmForm.controls.realmName.invalid && + (multisiteRealmForm.controls.realmName.touched || + multisiteRealmForm.controls.realmName.dirty) + " + [invalidText]="realmNameError" + cdRequiredField="Realm Name" + i18n + >Realm Name + <input + cdsText + type="text" + placeholder="Realm name..." 
+ id="realmName" + name="realmName" + formControlName="realmName" + [invalid]=" + multisiteRealmForm.controls.realmName.invalid && + (multisiteRealmForm.controls.realmName.touched || + multisiteRealmForm.controls.realmName.dirty) + " + [autofocus]="true" + modal-primary-focus + /> + </cds-text-label> + <ng-template #realmNameError> + <span + class="invalid-feedback" + *ngIf="multisiteRealmForm.showError('realmName', formDir, 'required')" + i18n + >This field is required.</span + > + <span + class="invalid-feedback" + *ngIf="multisiteRealmForm.showError('realmName', formDir, 'uniqueName')" + i18n + >The chosen realm name is already in use.</span + > + </ng-template> + </div> + + <div class="form-item"> + <cds-checkbox + label="Default" + for="default_realm" + formControlName="default_realm" + name="default_realm" + [disabled]="action === actionLabels.EDIT" + i18n + >Default + <cd-help-text *ngIf="action === actionLabels.EDIT && info.data.is_default"> + <span>You cannot unset the default flag.</span> + </cd-help-text> + <cd-help-text *ngIf="action === actionLabels.EDIT && !info.data.is_default"> + <span + >Please consult the <a href="{{ docUrl }}">documentation</a> to follow the failover + mechanism</span + > + </cd-help-text> + <cd-help-text *ngIf="defaultRealmDisabled && action === actionLabels.CREATE"> + <span>Default realm already exists.</span> + </cd-help-text> + </cds-checkbox> </div> - </div> - <div class="modal-footer"> - <cd-form-button-panel (submitActionEvent)="submit()" - [form]="multisiteRealmForm" - [submitText]="(action | titlecase) + ' ' + (resource | upperFirst)"></cd-form-button-panel> - </div> </form> - </ng-container> -</cd-modal> + </div> + <cd-form-button-panel + (submitActionEvent)="submit()" + [form]="multisiteRealmForm" + [submitText]="(action | titlecase) + ' ' + (resource | upperFirst)" + [modalForm]="true" + > + </cd-form-button-panel> +</cds-modal> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.spec.ts index becb1569ad6..f68619fe9ff 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.spec.ts @@ -14,6 +14,7 @@ import { SharedModule } from '~/app/shared/shared.module'; import { RgwMultisiteRealmFormComponent } from './rgw-multisite-realm-form.component'; import { configureTestBed } from '~/testing/unit-test-helper'; +import { CheckboxModule, InputModule, ModalModule } from 'carbon-components-angular'; describe('RgwMultisiteRealmFormComponent', () => { let component: RgwMultisiteRealmFormComponent; @@ -26,9 +27,16 @@ describe('RgwMultisiteRealmFormComponent', () => { ReactiveFormsModule, RouterTestingModule, HttpClientTestingModule, - ToastrModule.forRoot() + ToastrModule.forRoot(), + ModalModule, + InputModule, + CheckboxModule + ], + providers: [ + NgbActiveModal, + { provide: 'multisiteInfo', useValue: [[]] }, + { provide: 'info', useValue: { data: { name: 'null' } } } ], - providers: [NgbActiveModal], declarations: [RgwMultisiteRealmFormComponent] }); @@ -68,7 +76,6 @@ describe('RgwMultisiteRealmFormComponent', () => { it('tests create success notification', () => { spyOn(rgwRealmService, 'create').and.returnValue(observableOf([])); - component.action = 'create'; 
component.multisiteRealmForm.markAsDirty(); component.submit(); expect(notificationService.show).toHaveBeenCalledWith( diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.ts index 20cd2032faf..1e18598b0db 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-realm-form/rgw-multisite-realm-form.component.ts @@ -1,4 +1,4 @@ -import { Component, OnInit } from '@angular/core'; +import { Component, Inject, OnInit, Optional } from '@angular/core'; import { UntypedFormControl, Validators } from '@angular/forms'; import { NgbActiveModal } from '@ng-bootstrap/ng-bootstrap'; import { RgwRealmService } from '~/app/shared/api/rgw-realm.service'; @@ -9,26 +9,21 @@ import { CdValidators } from '~/app/shared/forms/cd-validators'; import { NotificationService } from '~/app/shared/services/notification.service'; import { RgwRealm } from '../models/rgw-multisite'; import { DocService } from '~/app/shared/services/doc.service'; +import { BaseModal } from 'carbon-components-angular'; @Component({ selector: 'cd-rgw-multisite-realm-form', templateUrl: './rgw-multisite-realm-form.component.html', styleUrls: ['./rgw-multisite-realm-form.component.scss'] }) -export class RgwMultisiteRealmFormComponent implements OnInit { - action: string; +export class RgwMultisiteRealmFormComponent extends BaseModal implements OnInit { multisiteRealmForm: CdFormGroup; - info: any; - editing = false; - resource: string; - multisiteInfo: object[] = []; realm: RgwRealm; realmList: RgwRealm[] = []; zonegroupList: RgwRealm[] = []; realmNames: string[]; newRealmName: string; isMaster: boolean; - defaultsInfo: string[]; defaultRealmDisabled = false; docUrl: string; @@ -37,11 +32,17 @@ export class RgwMultisiteRealmFormComponent implements OnInit { public actionLabels: ActionLabelsI18n, public rgwRealmService: RgwRealmService, public notificationService: NotificationService, - public docService: DocService + public docService: DocService, + @Optional() @Inject('action') public action: string, + @Optional() @Inject('resource') public resource: string, + @Optional() @Inject('info') public info: any, + @Optional() @Inject('multisiteInfo') public multisiteInfo: object[], + @Optional() @Inject('defaultsInfo') public defaultsInfo: string[], + @Optional() @Inject('editing') public editing: boolean ) { - this.action = this.editing - ? this.actionLabels.EDIT + this.resource - : this.actionLabels.CREATE + this.resource; + super(); + + this.action = this.editing ? this.actionLabels.EDIT : this.actionLabels.CREATE; this.createForm(); } @@ -52,7 +53,7 @@ export class RgwMultisiteRealmFormComponent implements OnInit { Validators.required, CdValidators.custom('uniqueName', (realmName: string) => { return ( - this.action === 'create' && + this.action === this.actionLabels.CREATE && this.realmNames && this.realmNames.indexOf(realmName) !== -1 ); @@ -71,7 +72,7 @@ export class RgwMultisiteRealmFormComponent implements OnInit { this.realmNames = this.realmList.map((realm) => { return realm['name']; }); - if (this.action === 'edit') { + if (this.action === this.actionLabels.EDIT) { this.zonegroupList = this.multisiteInfo[1] !== undefined && this.multisiteInfo[1].hasOwnProperty('zonegroups') ? 
this.multisiteInfo[1]['zonegroups'] @@ -97,7 +98,7 @@ export class RgwMultisiteRealmFormComponent implements OnInit { submit() { const values = this.multisiteRealmForm.getRawValue(); this.realm = new RgwRealm(); - if (this.action === 'create') { + if (this.action === this.actionLabels.CREATE) { this.realm.name = values['realmName']; this.rgwRealmService.create(this.realm, values['default_realm']).subscribe( () => { @@ -105,13 +106,13 @@ export class RgwMultisiteRealmFormComponent implements OnInit { NotificationType.success, $localize`Realm: '${values['realmName']}' created successfully` ); - this.activeModal.close(); + this.closeModal(); }, () => { this.multisiteRealmForm.setErrors({ cdSubmitButton: true }); } ); - } else if (this.action === 'edit') { + } else { this.realm.name = this.info.data.name; this.newRealmName = values['realmName']; this.rgwRealmService.update(this.realm, values['default_realm'], this.newRealmName).subscribe( @@ -120,7 +121,7 @@ export class RgwMultisiteRealmFormComponent implements OnInit { NotificationType.success, $localize`Realm: '${values['realmName']}' updated successfully` ); - this.activeModal.close(); + this.closeModal(); }, () => { this.multisiteRealmForm.setErrors({ cdSubmitButton: true }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html index e50666cdeaa..767305958d4 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.html @@ -64,6 +64,9 @@ i18n-placeholder placeholder="Source Bucket Name..." formControlName="source_bucket"/> + <cd-help-text> + <span i18n>{{ allBucketSelectedHelpText }}</span> + </cd-help-text> </div> </div> <div class="form-group row"> @@ -78,6 +81,9 @@ i18n-placeholder placeholder="Destination Bucket Name..." 
formControlName="destination_bucket"/> + <cd-help-text> + <span i18n>{{ allBucketSelectedHelpText }}</span> + </cd-help-text> </div> </div> </div> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts index 369658d7d42..1127db1c59a 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.spec.ts @@ -89,6 +89,47 @@ describe('RgwMultisiteSyncPipeModalComponent', () => { component.submit(); expect(spy).toHaveBeenCalled(); expect(putDataSpy).toHaveBeenCalled(); - expect(putDataSpy).toHaveBeenCalledWith(component.pipeForm.getRawValue()); + expect(putDataSpy).toHaveBeenCalledWith({ + ...component.pipeForm.getRawValue(), + mode: '', + user: '' + }); + }); + + it('should pass "user" and "mode" while creating/editing pipe', () => { + component.editing = true; + component.pipeForm.patchValue({ + pipe_id: 'pipe1', + group_id: 's3-bucket-replication:enabled', + source_bucket: '', + source_zones: { added: ['zone1-zg1-realm1'], removed: [] }, + destination_bucket: '', + destination_zones: { added: ['zone2-zg1-realm1'], removed: [] } + }); + component.pipeSelectedRow = { + dest: { bucket: '*', zones: ['zone2-zg1-realm1'] }, + id: 'pipi1', + params: { + dest: {}, + mode: 'user', + priority: 0, + source: { filter: { tags: [] } }, + user: 'dashboard' + }, + source: { bucket: '*', zones: ['zone1-zg1-realm1'] } + }; + + component.sourceZones.data.selected = ['zone1-zg1-realm1']; + component.destZones.data.selected = ['zone2-zg1-realm1']; + const spy = jest.spyOn(component, 'submit'); + const putDataSpy = jest.spyOn(multisiteServiceMock, 'createEditSyncPipe'); + component.submit(); + expect(spy).toHaveBeenCalled(); + expect(putDataSpy).toHaveBeenCalled(); + expect(putDataSpy).toHaveBeenCalledWith({ + ...component.pipeForm.getRawValue(), + mode: 'user', + user: 'dashboard' + }); }); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts index 2f41dbd23c8..43742ef60b8 100755 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-pipe-modal/rgw-multisite-sync-pipe-modal.component.ts @@ -18,6 +18,8 @@ import { ZoneData } from '../models/rgw-multisite-zone-selector'; import { SucceededActionLabelsI18n } from '~/app/shared/constants/app.constants'; const ALL_ZONES = $localize`All zones (*)`; +const ALL_BUCKET_SELECTED_HELP_TEXT = + 'If no value is provided, all the buckets in the zone group will be selected.'; @Component({ selector: 'cd-rgw-multisite-sync-pipe-modal', @@ -33,6 +35,7 @@ export class RgwMultisiteSyncPipeModalComponent implements OnInit { sourceZones = new ZoneData(false, 'Filter Zones'); destZones = new ZoneData(false, 'Filter Zones'); icons = Icons; + allBucketSelectedHelpText = ALL_BUCKET_SELECTED_HELP_TEXT; constructor( public activeModal: NgbActiveModal, @@ -187,7 +190,9 @@ export class 
RgwMultisiteSyncPipeModalComponent implements OnInit { .createEditSyncPipe({ ...this.pipeForm.getRawValue(), source_zones: sourceZones, - destination_zones: destZones + destination_zones: destZones, + user: this.editing ? this.pipeSelectedRow?.params?.user : '', + mode: this.editing ? this.pipeSelectedRow?.params?.mode : '' }) .subscribe( () => { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts index ee261db5042..03228856125 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-sync-policy/rgw-multisite-sync-policy.component.ts @@ -88,12 +88,22 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements { name: $localize`Zonegroup`, prop: 'zonegroup', - flexGrow: 1 + flexGrow: 1, + cellTransformation: CellTemplate.map, + customTemplateConfig: { + undefined: '-', + '': '-' + } }, { name: $localize`Bucket`, prop: 'bucket', - flexGrow: 1 + flexGrow: 1, + cellTransformation: CellTemplate.map, + customTemplateConfig: { + undefined: '-', + '': '-' + } } ]; this.rgwDaemonService.list().subscribe(); @@ -137,7 +147,7 @@ export class RgwMultisiteSyncPolicyComponent extends ListWithDetails implements groupName: policy['id'], status: policy['status'], bucket: policy['bucketName'], - zonegroup: '' + zonegroup: policy['zonegroup'] }); }); this.syncPolicyData = [...this.syncPolicyData]; diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-wizard/rgw-multisite-wizard.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-wizard/rgw-multisite-wizard.component.ts index 3d4b06528c1..2fbe1163ef8 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-wizard/rgw-multisite-wizard.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-wizard/rgw-multisite-wizard.component.ts @@ -236,6 +236,7 @@ export class RgwMultisiteWizardComponent extends BaseModal implements OnInit { ) .subscribe((data: object[]) => { this.setupCompleted = true; + this.rgwMultisiteService.setRestartGatewayMessage(false); this.loading = false; this.realms = data; this.showSuccessNotification(); @@ -258,6 +259,7 @@ export class RgwMultisiteWizardComponent extends BaseModal implements OnInit { .subscribe( () => { this.setupCompleted = true; + this.rgwMultisiteService.setRestartGatewayMessage(false); this.loading = false; this.showSuccessNotification(); }, diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-zone-form/rgw-multisite-zone-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-zone-form/rgw-multisite-zone-form.component.html index e6211e7d2f3..e6ad0603f17 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-zone-form/rgw-multisite-zone-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-zone-form/rgw-multisite-zone-form.component.html @@ -63,7 +63,7 @@ </cd-helper> </span> <cd-helper *ngIf="action === 'edit' && !isDefaultZone"> - <span i18n>Please consult the <a href="{{ docUrl }}">documentation</a> to follow the failover mechanism</span> + <span i18n>Please consult the <cd-doc section="rgw-multisite"></cd-doc> to follow the failover mechanism</span> 
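<!-- annotation: the failover hint now links through the shared cd-doc component instead of a hard-coded docUrl anchor -->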
</cd-helper><br> </div> <div class="custom-control custom-checkbox"> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-zonegroup-form/rgw-multisite-zonegroup-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-zonegroup-form/rgw-multisite-zonegroup-form.component.html index 6e1a32e386b..fe32f082cbc 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-zonegroup-form/rgw-multisite-zonegroup-form.component.html +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-zonegroup-form/rgw-multisite-zonegroup-form.component.html @@ -58,7 +58,7 @@ <cd-helper i18n>Zone group doesn't belong to the default realm.</cd-helper> </span> <cd-helper *ngIf="action === 'edit' && !info.data.is_default"> - <span i18n>Please consult the <a href="{{ docUrl }}">documentation</a> to follow the failover mechanism</span> + <span i18n>Please consult the <cd-doc section="rgw-multisite"></cd-doc> to follow the failover mechanism</span> </cd-helper> <cd-helper *ngIf="action === 'edit' && info.data.is_default"> <span i18n>You cannot unset the default flag.</span> @@ -76,7 +76,7 @@ <cd-helper i18n>Multiple master zone groups can't be configured. If you want to create a new zone group and make it the master zone group, you must delete the default zone group.</cd-helper> </span> <cd-helper *ngIf="action === 'edit' && !info.data.is_master"> - <span i18n>Please consult the <a href="{{ docUrl }}">documentation</a> to follow the failover mechanism</span> + <span i18n>Please consult the <cd-doc section="rgw-multisite"></cd-doc> to follow the failover mechanism</span> </cd-helper> <cd-helper *ngIf="action === 'edit' && info.data.is_master"> <span i18n>You cannot unset the master flag.</span> diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts index 8b5901769c3..00037a7235b 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-overview-dashboard/rgw-overview-dashboard.component.ts @@ -91,7 +91,9 @@ export class RgwOverviewDashboardComponent implements OnInit, OnDestroy { this.totalPoolUsedBytes = data['total_pool_bytes_used']; this.averageObjectSize = data['average_object_size']; }); - this.getSyncStatus(); + setTimeout(() => { + this.getSyncStatus(); + }); }); this.BucketSub = this.rgwBucketService .getTotalBucketsAndUsersLength() diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts index 3439562c8e2..a55cb179778 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw.module.ts @@ -70,7 +70,10 @@ import { IconModule, LoadingModule, ModalModule, - ProgressIndicatorModule + ProgressIndicatorModule, + CodeSnippetModule, + InputModule, + CheckboxModule } from 'carbon-components-angular'; import { CephSharedModule } from '../shared/ceph-shared.module'; @@ -94,10 +97,13 @@ import { CephSharedModule } from '../shared/ceph-shared.module'; ModalModule, GridModule, ProgressIndicatorModule, + CodeSnippetModule, ButtonModule, LoadingModule, IconModule, - NgbProgressbar + NgbProgressbar, + InputModule, + CheckboxModule ], exports: [ RgwDaemonListComponent, diff 
--git a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-details/role-details.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-details/role-details.component.ts index 244a7861b27..8b2c9f1eca3 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-details/role-details.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-details/role-details.component.ts @@ -31,28 +31,28 @@ export class RoleDetailsComponent implements OnChanges, OnInit { prop: 'read', name: $localize`Read`, flexGrow: 1, - cellClass: 'text-center', + cellClass: 'text-left', cellTransformation: CellTemplate.checkIcon }, { prop: 'create', name: $localize`Create`, flexGrow: 1, - cellClass: 'text-center', + cellClass: 'text-left', cellTransformation: CellTemplate.checkIcon }, { prop: 'update', name: $localize`Update`, flexGrow: 1, - cellClass: 'text-center', + cellClass: 'text-left', cellTransformation: CellTemplate.checkIcon }, { prop: 'delete', name: $localize`Delete`, flexGrow: 1, - cellClass: 'text-center', + cellClass: 'text-left', cellTransformation: CellTemplate.checkIcon } ]; diff --git a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-list/role-list.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-list/role-list.component.ts index c9640e4ffab..8e7e12b3692 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-list/role-list.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/core/auth/role-list/role-list.component.ts @@ -99,7 +99,7 @@ export class RoleListComponent extends ListWithDetails implements OnInit { { name: $localize`System Role`, prop: 'system', - cellClass: 'text-center', + cellClass: 'text-left', flexGrow: 1, cellTransformation: CellTemplate.checkIcon } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts index 313db3445f2..92eee852d88 100644..100755 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.spec.ts @@ -2,13 +2,15 @@ import { TestBed } from '@angular/core/testing'; import { HttpClientTestingModule, HttpTestingController } from '@angular/common/http/testing'; import { configureTestBed } from '~/testing/unit-test-helper'; import { NvmeofService } from '../../shared/api/nvmeof.service'; +import { throwError } from 'rxjs'; describe('NvmeofService', () => { let service: NvmeofService; let httpTesting: HttpTestingController; const mockGroupName = 'default'; const mockNQN = 'nqn.2001-07.com.ceph:1721041732363'; - + const UI_API_PATH = 'ui-api/nvmeof'; + const API_PATH = 'api/nvmeof'; configureTestBed({ providers: [NvmeofService], imports: [HttpClientTestingModule] @@ -27,51 +29,155 @@ describe('NvmeofService', () => { expect(service).toBeTruthy(); }); - it('should call listGatewayGroups', () => { - service.listGatewayGroups().subscribe(); - const req = httpTesting.expectOne('api/nvmeof/gateway/group'); - expect(req.request.method).toBe('GET'); - }); + describe('test gateway APIs', () => { + it('should call listGatewayGroups', () => { + service.listGatewayGroups().subscribe(); + const req = httpTesting.expectOne(`${API_PATH}/gateway/group`); + expect(req.request.method).toBe('GET'); + }); - it('should call listGateways', () => { - service.listGateways().subscribe(); - const req = httpTesting.expectOne('api/nvmeof/gateway'); - expect(req.request.method).toBe('GET'); + 
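// Annotation: the nvmeof specs below are regrouped into per-area describe() blocks and
// pass gw_group where the service now requires it, e.g. listSubsystems('default')
// expects api/nvmeof/subsystem?gw_group=default.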
it('should call listGateways', () => { + service.listGateways().subscribe(); + const req = httpTesting.expectOne(`${API_PATH}/gateway`); + expect(req.request.method).toBe('GET'); + }); }); - it('should call listSubsystems', () => { - service.listSubsystems(mockGroupName).subscribe(); - const req = httpTesting.expectOne(`api/nvmeof/subsystem?gw_group=${mockGroupName}`); - expect(req.request.method).toBe('GET'); - }); + describe('test subsystems APIs', () => { + it('should call listSubsystems', () => { + service.listSubsystems(mockGroupName).subscribe(); + const req = httpTesting.expectOne(`${API_PATH}/subsystem?gw_group=${mockGroupName}`); + expect(req.request.method).toBe('GET'); + }); + + it('should call getSubsystem', () => { + service.getSubsystem(mockNQN, mockGroupName).subscribe(); + const req = httpTesting.expectOne( + `${API_PATH}/subsystem/${mockNQN}?gw_group=${mockGroupName}` + ); + expect(req.request.method).toBe('GET'); + }); + + it('should call createSubsystem', () => { + const request = { + nqn: mockNQN, + enable_ha: true, + initiators: '*', + gw_group: mockGroupName + }; + service.createSubsystem(request).subscribe(); + const req = httpTesting.expectOne(`${API_PATH}/subsystem`); + expect(req.request.method).toBe('POST'); + }); - it('should call getSubsystem', () => { - service.getSubsystem(mockNQN, mockGroupName).subscribe(); - const req = httpTesting.expectOne(`api/nvmeof/subsystem/${mockNQN}?gw_group=${mockGroupName}`); - expect(req.request.method).toBe('GET'); + it('should call deleteSubsystem', () => { + service.deleteSubsystem(mockNQN, mockGroupName).subscribe(); + const req = httpTesting.expectOne( + `${API_PATH}/subsystem/${mockNQN}?gw_group=${mockGroupName}` + ); + expect(req.request.method).toBe('DELETE'); + }); + it('should call isSubsystemPresent', () => { + spyOn(service, 'getSubsystem').and.returnValue(throwError('test')); + service.isSubsystemPresent(mockNQN, mockGroupName).subscribe((res) => { + expect(res).toBe(false); + }); + }); }); - it('should call createSubsystem', () => { - const request = { - nqn: mockNQN, - enable_ha: true, - initiators: '*', - gw_group: mockGroupName - }; - service.createSubsystem(request).subscribe(); - const req = httpTesting.expectOne('api/nvmeof/subsystem'); - expect(req.request.method).toBe('POST'); + describe('test initiators APIs', () => { + let request = { host_nqn: '', gw_group: mockGroupName }; + it('should call getInitiators', () => { + service.getInitiators(mockNQN, mockGroupName).subscribe(); + const req = httpTesting.expectOne( + `${API_PATH}/subsystem/${mockNQN}/host?gw_group=${mockGroupName}` + ); + expect(req.request.method).toBe('GET'); + }); + it('should call addInitiators', () => { + service.addInitiators(mockNQN, request).subscribe(); + const req = httpTesting.expectOne(`${UI_API_PATH}/subsystem/${mockNQN}/host`); + expect(req.request.method).toBe('POST'); + }); + it('should call removeInitiators', () => { + service.removeInitiators(mockNQN, request).subscribe(); + const req = httpTesting.expectOne( + `${UI_API_PATH}/subsystem/${mockNQN}/host/${request.host_nqn}/${mockGroupName}` + ); + expect(req.request.method).toBe('DELETE'); + }); }); - it('should call deleteSubsystem', () => { - service.deleteSubsystem(mockNQN, mockGroupName).subscribe(); - const req = httpTesting.expectOne(`api/nvmeof/subsystem/${mockNQN}?gw_group=${mockGroupName}`); - expect(req.request.method).toBe('DELETE'); + describe('test listener APIs', () => { + it('it should listListeners', () => { + service.listListeners(mockNQN, 
mockGroupName).subscribe(); + const req = httpTesting.expectOne( + `${API_PATH}/subsystem/${mockNQN}/listener?gw_group=${mockGroupName}` + ); + expect(req.request.method).toBe('GET'); + }); + it('should call createListener', () => { + const request = { + gw_group: mockGroupName, + host_name: 'ceph-node-02', + traddr: '192.168.100.102', + trsvcid: 4421 + }; + service.createListener(mockNQN, request).subscribe(); + const req = httpTesting.expectOne(`${API_PATH}/subsystem/${mockNQN}/listener`); + expect(req.request.method).toBe('POST'); + }); + it('should call deleteListener', () => { + const request = { host_name: 'ceph-node-02', traddr: '192.168.100.102', trsvcid: '4421' }; + service + .deleteListener(mockNQN, request.host_name, request.traddr, request.trsvcid) + .subscribe(); + const req = httpTesting.expectOne( + `${API_PATH}/subsystem/${mockNQN}/listener/${request.host_name}/${request.traddr}?trsvcid=${request.trsvcid}` + ); + expect(req.request.method).toBe('DELETE'); + }); }); - it('should call getInitiators', () => { - service.getInitiators(mockNQN).subscribe(); - const req = httpTesting.expectOne(`api/nvmeof/subsystem/${mockNQN}/host`); - expect(req.request.method).toBe('GET'); + describe('test namespace APIs', () => { + const mockNsid = '1'; + it('should call listNamespaces', () => { + service.listNamespaces(mockNQN, mockGroupName).subscribe(); + const req = httpTesting.expectOne( + `${API_PATH}/subsystem/${mockNQN}/namespace?gw_group=${mockGroupName}` + ); + expect(req.request.method).toBe('GET'); + }); + it('should call getNamespace', () => { + service.getNamespace(mockNQN, mockNsid, mockGroupName).subscribe(); + const req = httpTesting.expectOne( + `${API_PATH}/subsystem/${mockNQN}/namespace/${mockNsid}?gw_group=${mockGroupName}` + ); + expect(req.request.method).toBe('GET'); + }); + it('should call createNamespace', () => { + const mockNamespaceObj = { + rbd_image_name: 'nvme_ns_image:12345678', + rbd_pool: 'rbd', + size: 1024, + gw_group: mockGroupName + }; + service.createNamespace(mockNQN, mockNamespaceObj).subscribe(); + const req = httpTesting.expectOne(`${API_PATH}/subsystem/${mockNQN}/namespace`); + expect(req.request.method).toBe('POST'); + }); + it('should call updateNamespace', () => { + const request = { rbd_image_size: 1024, gw_group: mockGroupName }; + service.updateNamespace(mockNQN, mockNsid, request).subscribe(); + const req = httpTesting.expectOne(`${API_PATH}/subsystem/${mockNQN}/namespace/${mockNsid}`); + expect(req.request.method).toBe('PATCH'); + }); + it('should call deleteNamespace', () => { + service.deleteNamespace(mockNQN, mockNsid, mockGroupName).subscribe(); + const req = httpTesting.expectOne( + `${API_PATH}/subsystem/${mockNQN}/namespace/${mockNsid}?gw_group=${mockGroupName}` + ); + expect(req.request.method).toBe('DELETE'); + }); }); }); diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts index 40202d0d672..a2bbf507bc3 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts @@ -8,6 +8,7 @@ import { catchError, mapTo } from 'rxjs/operators'; export const MAX_NAMESPACE = 1024; export interface ListenerRequest { + gw_group: string; host_name: string; traddr: string; trsvcid: number; @@ -17,14 +18,17 @@ export interface NamespaceCreateRequest { rbd_image_name: string; rbd_pool: string; size: number; + gw_group: string; } export interface 
NamespaceEditRequest { rbd_image_size: number; + gw_group: string; } export interface InitiatorRequest { host_nqn: string; + gw_group: string; } const API_PATH = 'api/nvmeof'; @@ -81,8 +85,8 @@ export class NvmeofService { } // Initiators - getInitiators(subsystemNQN: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/host`); + getInitiators(subsystemNQN: string, group: string) { + return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/host?gw_group=${group}`); } addInitiators(subsystemNQN: string, request: InitiatorRequest) { @@ -92,14 +96,17 @@ export class NvmeofService { } removeInitiators(subsystemNQN: string, request: InitiatorRequest) { - return this.http.delete(`${UI_API_PATH}/subsystem/${subsystemNQN}/host/${request.host_nqn}`, { - observe: 'response' - }); + return this.http.delete( + `${UI_API_PATH}/subsystem/${subsystemNQN}/host/${request.host_nqn}/${request.gw_group}`, + { + observe: 'response' + } + ); } // Listeners - listListeners(subsystemNQN: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/listener`); + listListeners(subsystemNQN: string, group: string) { + return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/listener?gw_group=${group}`); } createListener(subsystemNQN: string, request: ListenerRequest) { @@ -121,12 +128,14 @@ export class NvmeofService { } // Namespaces - listNamespaces(subsystemNQN: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace`); + listNamespaces(subsystemNQN: string, group: string) { + return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace?gw_group=${group}`); } - getNamespace(subsystemNQN: string, nsid: string) { - return this.http.get(`${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}`); + getNamespace(subsystemNQN: string, nsid: string, group: string) { + return this.http.get( + `${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}?gw_group=${group}` + ); } createNamespace(subsystemNQN: string, request: NamespaceCreateRequest) { @@ -141,9 +150,12 @@ export class NvmeofService { }); } - deleteNamespace(subsystemNQN: string, nsid: string) { - return this.http.delete(`${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}`, { - observe: 'response' - }); + deleteNamespace(subsystemNQN: string, nsid: string, group: string) { + return this.http.delete( + `${API_PATH}/subsystem/${subsystemNQN}/namespace/${nsid}?gw_group=${group}`, + { + observe: 'response' + } + ); } } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts index d1f9997791a..c81c9193a2e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.spec.ts @@ -3,6 +3,7 @@ import { TestBed } from '@angular/core/testing'; import { configureTestBed } from '~/testing/unit-test-helper'; import { OsdService } from './osd.service'; +import { CdTableFetchDataContext } from '../models/cd-table-fetch-data-context'; describe('OsdService', () => { let service: OsdService; @@ -64,8 +65,9 @@ describe('OsdService', () => { }); it('should call getList', () => { - service.getList().subscribe(); - const req = httpTesting.expectOne('api/osd'); + const context = new CdTableFetchDataContext(() => {}); + service.getList(context.toParams()).observable.subscribe(); + const req = httpTesting.expectOne('api/osd?offset=0&limit=10&search=&sort=%2Bname'); expect(req.request.method).toBe('GET'); }); diff 
--git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts index f2ed4d7cc9e..85a75073dea 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/osd.service.ts @@ -1,4 +1,4 @@ -import { HttpClient } from '@angular/common/http'; +import { HttpClient, HttpParams } from '@angular/common/http'; import { Injectable } from '@angular/core'; import _ from 'lodash'; @@ -12,6 +12,9 @@ import { OsdSettings } from '../models/osd-settings'; import { SmartDataResponseV1 } from '../models/smart'; import { DeviceService } from '../services/device.service'; import { CdFormGroup } from '../forms/cd-form-group'; +import { PaginateObservable } from './paginate.model'; +import { PaginateParams } from '../classes/paginate-params.class'; +import { Osd } from '../models/osd.model'; @Injectable({ providedIn: 'root' @@ -80,8 +83,10 @@ export class OsdService { return this.http.post(this.path, request, { observe: 'response' }); } - getList() { - return this.http.get(`${this.path}`); + getList(params: HttpParams): PaginateObservable<Osd[]> { + return new PaginateObservable<Osd[]>( + this.http.get<Osd[]>(this.path, new PaginateParams(params, 1, 1)) + ); } getOsdSettings(): Observable<OsdSettings> { diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts index 703792a7571..77ec4e43f7c 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/paginate.model.ts @@ -9,7 +9,7 @@ export class PaginateObservable<Type> { this.observable = obs.pipe( map((response: any) => { this.count = Number(response.headers?.get('X-Total-Count')); - return response['body']; + return response['body'] || response; }) ); } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts index d57cd523a4d..3dc886e172f 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/rgw-multisite.service.ts @@ -2,6 +2,7 @@ import { HttpClient, HttpParams } from '@angular/common/http'; import { Injectable } from '@angular/core'; import { RgwRealm, RgwZone, RgwZonegroup } from '~/app/ceph/rgw/models/rgw-multisite'; import { RgwDaemonService } from './rgw-daemon.service'; +import { BehaviorSubject } from 'rxjs'; @Injectable({ providedIn: 'root' @@ -10,6 +11,9 @@ export class RgwMultisiteService { private uiUrl = 'ui-api/rgw/multisite'; private url = 'api/rgw/multisite'; + private restartGatewayMessageSource = new BehaviorSubject<boolean>(null); + restartGatewayMessage$ = this.restartGatewayMessageSource.asObservable(); + constructor(private http: HttpClient, public rgwDaemonService: RgwDaemonService) {} migrate(realm: RgwRealm, zonegroup: RgwZonegroup, zone: RgwZone) { @@ -28,7 +32,9 @@ export class RgwMultisiteService { } getSyncStatus() { - return this.http.get(`${this.url}/sync_status`); + return this.rgwDaemonService.request((params: HttpParams) => { + return this.http.get(`${this.url}/sync_status`, { params: params }); + }); } status() { @@ -123,8 +129,15 @@ export class RgwMultisiteService { ); } - createEditSyncPipe(payload: any) { - return this.http.put(`${this.url}/sync-pipe`, payload); + 
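// Annotation: the optional user and mode arguments are appended as query params, so a
// call such as createEditSyncPipe(payload, 'dashboard', 'user') would PUT
// api/rgw/multisite/sync-pipe?user=dashboard&mode=user with the payload as the body.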
createEditSyncPipe(payload: any, user?: string, mode?: string) { + let params = new HttpParams(); + if (user) { + params = params.append('user', user); + } + if (mode) { + params = params.append('mode', mode); + } + return this.http.put(`${this.url}/sync-pipe`, payload, { params }); } removeSyncPipe(pipe_id: string, group_id: string, bucket_name?: string) { @@ -137,4 +150,8 @@ export class RgwMultisiteService { { params } ); } + + setRestartGatewayMessage(value: boolean): void { + this.restartGatewayMessageSource.next(value); + } } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts new file mode 100644 index 00000000000..a1b079b426b --- /dev/null +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/classes/paginate-params.class.ts @@ -0,0 +1,15 @@ +import { HttpParams } from '@angular/common/http'; + +export class PaginateParams { + constructor(params: HttpParams, majorVersion = 1, minorVersion = 0) { + const options = { + params: params, + headers: { + Accept: `application/vnd.ceph.api.v${majorVersion}.${minorVersion}+json` + } + }; + + options['observe'] = 'response'; + return options; + } +} diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/components/help-text/help-text.component.scss b/src/pybind/mgr/dashboard/frontend/src/app/shared/components/help-text/help-text.component.scss index f7be01cd929..653ea5993a2 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/components/help-text/help-text.component.scss +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/components/help-text/help-text.component.scss @@ -1,3 +1,3 @@ -::ng-deep legend .text-muted { +.form-text { font-size: small; } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/constants/cephfs.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/constants/cephfs.ts deleted file mode 100644 index 56890ff7214..00000000000 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/constants/cephfs.ts +++ /dev/null @@ -1 +0,0 @@ -export const DEFAULT_SUBVOLUME_GROUP = '_nogroup'; diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-actions/table-actions.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-actions/table-actions.component.ts index d8304127fab..51120f623f2 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-actions/table-actions.component.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table-actions/table-actions.component.ts @@ -147,7 +147,9 @@ export class TableActionsComponent implements OnChanges, OnInit { useDisableDesc(action: CdTableAction) { if (action.disable) { const result = action.disable(this.selection); - return _.isString(result) ? result : undefined; + return _.isString(result) ? result : action.title ? 
action.title : undefined; + } else if (action.title) { + return action.title; } return undefined; } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-action.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-action.ts index e832665c5dc..f773422ac19 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-action.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-action.ts @@ -19,6 +19,8 @@ export class CdTableAction { // The font awesome icon that will be used icon: string; + // For adding the default tooltip + title?: string; /** * You can define the condition to disable the action. * By default all 'update' and 'delete' actions will only be enabled diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts index 0df2d2ebbe0..6ea415bfee9 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/cd-table-fetch-data-context.ts @@ -18,7 +18,7 @@ export class CdTableFetchDataContext { search = ''; sort = '+name'; - constructor(error: () => void) { + constructor(error?: () => void) { this.error = error; } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts new file mode 100644 index 00000000000..f22987e439e --- /dev/null +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/models/osd.model.ts @@ -0,0 +1,49 @@ +/* We will need to check what are all the value that the + UI need and only make them the mandatory parameters here. + For now based on what I saw in the unit test file; + osd-list.component.spec.ts, I've made the decision to make + things optional and non-optional. This should be re-evaluated. 
*/ + +export interface Osd { + id: number; + host: Host; + stats_history: StatsHistory; + state: string[]; + stats: Stats; + collectedStates?: string[]; + in?: number; + out?: number; + up?: number; + down?: number; + destroyed?: number; + cdIsBinary?: boolean; + cdIndivFlags?: string[]; + cdClusterFlags?: string[]; + cdExecuting?: any; + tree?: Tree; + operational_status?: string; +} + +interface Tree { + device_class: string; +} + +interface Host { + id: number; + name: string; +} + +interface StatsHistory { + op_out_bytes: any[]; + op_in_bytes: any[]; + out_bytes?: any[]; + in_bytes?: any[]; +} + +interface Stats { + stat_bytes_used: number; + stat_bytes: number; + op_w?: number; + op_r?: number; + usage?: number; +} diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts index 59d7572e9f0..45cca684dab 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/xml.pipe.ts @@ -7,9 +7,13 @@ import { JsonToXmlService } from '../services/json-to-xml.service'; export class XmlPipe implements PipeTransform { constructor(private jsonToXmlService: JsonToXmlService) {} - transform(value: string, valueFormat: string = 'json'): string { + transform( + value: string, + replaceKey: Record<string, string> = {}, + valueFormat: string = 'json' + ): string { if (valueFormat === 'json') { - value = this.jsonToXmlService.format(value); + value = this.jsonToXmlService.format(value, replaceKey); } return value; } diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts index 8f1d128c0c5..e9d30f9b7f2 100644 --- a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts +++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/json-to-xml.service.ts @@ -6,29 +6,39 @@ import { Injectable } from '@angular/core'; export class JsonToXmlService { constructor() {} - format(json: any, indentSize: number = 2, currentIndent: number = 0): string { + format( + json: any, + replaceKey: Record<string, string> = null, + indentSize: number = 2, + currentIndent: number = 0 + ): string { if (!json) return null; let xml = ''; if (typeof json === 'string') { json = JSON.parse(json); } - for (const key in json) { + for (let key in json) { if (json.hasOwnProperty(key)) { const value = json[key]; const indentation = ' '.repeat(currentIndent); - + if (replaceKey) { + const [oldKey, newKey] = Object.entries(replaceKey)[0]; + if (key === oldKey) { + key = newKey; + } + } if (Array.isArray(value)) { value.forEach((item) => { xml += `${indentation}<${key}>\n` + - this.format(item, indentSize, currentIndent + indentSize) + + this.format(item, replaceKey, indentSize, currentIndent + indentSize) + `${indentation}</${key}>\n`; }); } else if (typeof value === 'object') { xml += `${indentation}<${key}>\n` + - this.format(value, indentSize, currentIndent + indentSize) + + this.format(value, replaceKey, indentSize, currentIndent + indentSize) + `${indentation}</${key}>\n`; } else { xml += `${indentation}<${key}>${value}</${key}>\n`; diff --git a/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss b/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss index 1d12facaf6a..61ca421101e 100644 --- a/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss +++ 
b/src/pybind/mgr/dashboard/frontend/src/styles/_carbon-defaults.scss @@ -142,3 +142,10 @@ Dashboard page cd-dashboard { font-size: 12px; } + +/****************************************** +Code snippet +******************************************/ +.cds--snippet { + width: fit-content; +} diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml index e8ab663d0d5..b464344e27a 100644 --- a/src/pybind/mgr/dashboard/openapi.yaml +++ b/src/pybind/mgr/dashboard/openapi.yaml @@ -8293,6 +8293,7 @@ paths: description: Enable high availability type: boolean gw_group: + description: NVMeoF gateway group type: string max_namespaces: default: 1024 @@ -8346,6 +8347,7 @@ paths: schema: type: boolean - allowEmptyValue: true + description: NVMeoF gateway group in: query name: gw_group schema: @@ -8384,6 +8386,7 @@ paths: schema: type: string - allowEmptyValue: true + description: NVMeoF gateway group in: query name: gw_group schema: @@ -8417,6 +8420,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8446,6 +8455,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8479,6 +8494,9 @@ paths: application/json: schema: properties: + gw_group: + description: NVMeoF gateway group + type: string host_nqn: description: NVMeoF host NQN. Use "*" to allow any host. type: string @@ -8525,6 +8543,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8559,6 +8583,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8596,6 +8626,9 @@ paths: default: 0 description: NVMeoF address family (0 - IPv4, 1 - IPv6) type: integer + gw_group: + description: NVMeoF gateway group + type: string host_name: description: NVMeoF hostname type: string @@ -8673,6 +8706,12 @@ paths: name: force schema: type: boolean + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8707,6 +8746,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8748,6 +8793,9 @@ paths: default: true description: Create RBD image type: boolean + gw_group: + description: NVMeoF gateway group + type: string load_balancing_group: description: Load balancing group type: integer @@ -8805,6 +8853,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '202': content: @@ -8844,6 +8898,12 @@ paths: required: true schema: type: string + - allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8883,6 +8943,9 @@ paths: application/json: schema: properties: + gw_group: + description: NVMeoF gateway group + type: string load_balancing_group: description: Load balancing group type: integer @@ -8937,6 +9000,12 @@ paths: required: true schema: type: string + 
- allowEmptyValue: true + description: NVMeoF gateway group + in: query + name: gw_group + schema: + type: string responses: '200': content: @@ -8959,11 +9028,31 @@ paths: - NVMe-oF Subsystem Namespace /api/osd: get: - parameters: [] + parameters: + - default: 0 + in: query + name: offset + schema: + type: integer + - default: 10 + in: query + name: limit + schema: + type: integer + - default: '' + in: query + name: search + schema: + type: string + - default: '' + in: query + name: sort + schema: + type: string responses: '200': content: - application/vnd.ceph.api.v1.0+json: + application/vnd.ceph.api.v1.1+json: type: object description: OK '400': @@ -11104,7 +11193,6 @@ paths: type: string required: - bucket_id - - uid type: object responses: '200': @@ -11384,6 +11472,9 @@ paths: type: string group_id: type: string + mode: + default: '' + type: string pipe_id: type: string source_bucket: @@ -11391,6 +11482,9 @@ paths: type: string source_zones: type: string + user: + default: '' + type: string required: - group_id - pipe_id @@ -11447,11 +11541,6 @@ paths: type: string - default: '' in: query - name: destination_bucket - schema: - type: string - - default: '' - in: query name: bucket_name schema: type: string @@ -11677,7 +11766,12 @@ paths: - RgwMultisite /api/rgw/multisite/sync_status: get: - parameters: [] + parameters: + - allowEmptyValue: true + in: query + name: daemon_name + schema: + type: string responses: '200': content: diff --git a/src/pybind/mgr/dashboard/services/nvmeof_client.py b/src/pybind/mgr/dashboard/services/nvmeof_client.py index d6b126500b0..e0ea6d1e48b 100644 --- a/src/pybind/mgr/dashboard/services/nvmeof_client.py +++ b/src/pybind/mgr/dashboard/services/nvmeof_client.py @@ -22,7 +22,7 @@ else: class NVMeoFClient(object): pb2 = pb2 - def __init__(self, gw_group: Optional[str] = None): + def __init__(self, gw_group: Optional[str] = None, traddr: Optional[str] = None): logger.info("Initiating nvmeof gateway connection...") try: if not gw_group: @@ -36,6 +36,23 @@ else: f'Unable to retrieve the gateway info: {e}' ) + # While creating listener need to direct request to the gateway + # address where listener is supposed to be added. 
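The traddr-based lookup added below can be exercised in isolation; here is a minimal sketch of the same matching logic, where the shape of the gateways mapping follows what get_gateways_config() returns but the concrete group names and addresses are made-up examples:

    from typing import Optional

    def resolve_gateway_addr(gateways_info: dict, traddr: str) -> Optional[str]:
        # Pick the gateway whose service_url contains the listener address (traddr).
        matched = next(
            (gw
             for gws in gateways_info['gateways'].values()
             for gw in gws
             if traddr in gw['service_url']),
            None
        )
        return matched['service_url'] if matched else None

    # Hypothetical config, for illustration only.
    cfg = {'gateways': {'nvmeof.a': [{'service_url': '192.168.100.11:5500'}],
                        'nvmeof.b': [{'service_url': '192.168.100.12:5500'}]}}
    assert resolve_gateway_addr(cfg, '192.168.100.12') == '192.168.100.12:5500'
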
+ if traddr: + gateways_info = NvmeofGatewaysConfig.get_gateways_config() + matched_gateway = next( + ( + gateway + for gateways in gateways_info['gateways'].values() + for gateway in gateways + if traddr in gateway['service_url'] + ), + None + ) + if matched_gateway: + self.gateway_addr = matched_gateway.get('service_url') + logger.debug("Gateway address set to: %s", self.gateway_addr) + root_ca_cert = NvmeofGatewaysConfig.get_root_ca_cert(service_name) if root_ca_cert: client_key = NvmeofGatewaysConfig.get_client_key(service_name) diff --git a/src/pybind/mgr/dashboard/services/nvmeof_conf.py b/src/pybind/mgr/dashboard/services/nvmeof_conf.py index a5a9979af25..2426c599078 100644 --- a/src/pybind/mgr/dashboard/services/nvmeof_conf.py +++ b/src/pybind/mgr/dashboard/services/nvmeof_conf.py @@ -93,9 +93,9 @@ class NvmeofGatewaysConfig(object): return None if group: - return cls._get_name_url_for_group(gateways, group) + return _get_name_url_for_group(gateways, group) - return cls._get_default_service(gateways) + return _get_default_service(gateways) except (KeyError, IndexError) as e: raise DashboardException( @@ -129,52 +129,66 @@ class NvmeofGatewaysConfig(object): orch = OrchClient.instance() if orch.available(): if key: - return orch.cert_store.get_key(entity, service_name) - return orch.cert_store.get_cert(entity, service_name) + return orch.cert_store.get_key(entity, service_name, + ignore_missing_exception=True) + return orch.cert_store.get_cert(entity, service_name, + ignore_missing_exception=True) return None except OrchestratorError: # just return None if any orchestrator error is raised # otherwise nvmeof api will raise this error and doesn't proceed. return None - @classmethod - def _get_name_url_for_group(cls, gateways, group): - try: - orch = OrchClient.instance() - for service_name, svc_config in gateways.items(): - # get the group name of the service and match it against the - # group name provided - group_name_from_svc = orch.services.get(service_name)[0].spec.group - if group == group_name_from_svc: - running_daemons = cls._get_running_daemons(orch, service_name) - config = cls._get_running_daemon_svc_config(svc_config, running_daemons) - - if config: - return service_name, config['service_url'] - return None - except OrchestratorError: - return cls._get_default_service(gateways) +def _get_name_url_for_group(gateways, group): + try: + orch = OrchClient.instance() + for service_name, svc_config in gateways.items(): + # get the group name of the service and match it against the + # group name provided + group_name_from_svc = orch.services.get(service_name)[0].spec.group + if group == group_name_from_svc: + running_daemons = _get_running_daemons(orch, service_name) + config = _get_running_daemon_svc_config(svc_config, running_daemons) - @classmethod - def _get_running_daemons(cls, orch, service_name): - # get the running nvmeof daemons - daemons = [d.to_dict() - for d in orch.services.list_daemons(service_name=service_name)] - return [d['daemon_name'] for d in daemons - if d['status_desc'] == 'running'] + if config: + return service_name, config['service_url'] + return None - @classmethod - def _get_running_daemon_svc_config(cls, svc_config, running_daemons): - try: - return next(config for config in svc_config - if config['daemon_name'] in running_daemons) - except StopIteration: - return None + except OrchestratorError: + return _get_default_service(gateways) - @classmethod - def _get_default_service(cls, gateways): - if gateways: - service_name = list(gateways.keys())[0] - 
return service_name, gateways[service_name][0]['service_url'] + +def _get_running_daemons(orch, service_name): + # get the running nvmeof daemons + daemons = [d.to_dict() + for d in orch.services.list_daemons(service_name=service_name)] + return [d['daemon_name'] for d in daemons + if d['status_desc'] == 'running'] + + +def _get_running_daemon_svc_config(svc_config, running_daemons): + try: + return next(config for config in svc_config + if config['daemon_name'] in running_daemons) + except StopIteration: return None + + +def _get_default_service(gateways): + if gateways: + gateway_keys = list(gateways.keys()) + # if there are more than 1 gateway, rather than chosing a random gateway + # from any of the group, raise an exception to make it clear that we need + # to specify the group name in the API request. + if len(gateway_keys) > 1: + raise DashboardException( + msg=( + "Multiple NVMe-oF gateway groups are configured. " + "Please specify the 'gw_group' parameter in the request." + ), + component="nvmeof" + ) + service_name = gateway_keys[0] + return service_name, gateways[service_name][0]['service_url'] + return None diff --git a/src/pybind/mgr/dashboard/services/orchestrator.py b/src/pybind/mgr/dashboard/services/orchestrator.py index 1f77b3c0ab5..38859167568 100644 --- a/src/pybind/mgr/dashboard/services/orchestrator.py +++ b/src/pybind/mgr/dashboard/services/orchestrator.py @@ -209,13 +209,17 @@ class CertStoreManager(ResourceManager): @wait_api_result def get_cert(self, entity: str, service_name: Optional[str] = None, - hostname: Optional[str] = None) -> str: - return self.api.cert_store_get_cert(entity, service_name, hostname) + hostname: Optional[str] = None, + ignore_missing_exception: bool = False) -> str: + return self.api.cert_store_get_cert(entity, service_name, hostname, + no_exception_when_missing=ignore_missing_exception) @wait_api_result def get_key(self, entity: str, service_name: Optional[str] = None, - hostname: Optional[str] = None) -> str: - return self.api.cert_store_get_key(entity, service_name, hostname) + hostname: Optional[str] = None, + ignore_missing_exception: bool = False) -> str: + return self.api.cert_store_get_key(entity, service_name, hostname, + no_exception_when_missing=ignore_missing_exception) class OrchClient(object): diff --git a/src/pybind/mgr/dashboard/services/rgw_client.py b/src/pybind/mgr/dashboard/services/rgw_client.py index 2441b73b361..2fe09821694 100755 --- a/src/pybind/mgr/dashboard/services/rgw_client.py +++ b/src/pybind/mgr/dashboard/services/rgw_client.py @@ -10,6 +10,7 @@ import re import time import uuid import xml.etree.ElementTree as ET # noqa: N814 +from collections import defaultdict from enum import Enum from subprocess import SubprocessError from urllib.parse import urlparse @@ -288,21 +289,22 @@ class RgwClient(RestClient): daemon_keys = RgwClient._daemons.keys() if not daemon_name: - if len(daemon_keys) > 1: - try: - multiiste = RgwMultisite() - default_zonegroup = multiiste.get_all_zonegroups_info()['default_zonegroup'] - - # Iterate through _daemons.values() to find the daemon with the - # matching zonegroup_id - for daemon in RgwClient._daemons.values(): - if daemon.zonegroup_id == default_zonegroup: - daemon_name = daemon.name - break - except Exception: # pylint: disable=broad-except - daemon_name = next(iter(daemon_keys)) - else: - # Handle the case where there is only one or no key in _daemons + try: + if len(daemon_keys) > 1: + default_zonegroup = ( + RgwMultisite() + .get_all_zonegroups_info()['default_zonegroup'] + 
) + if default_zonegroup: + daemon_name = next( + (daemon.name + for daemon in RgwClient._daemons.values() + if daemon.zonegroup_id == default_zonegroup), + None + ) + daemon_name = daemon_name or next(iter(daemon_keys)) + except Exception as e: # pylint: disable=broad-except + logger.exception('Failed to determine default RGW daemon: %s', str(e)) daemon_name = next(iter(daemon_keys)) # Discard all cached instances if any rgw setting has changed @@ -700,12 +702,28 @@ class RgwClient(RestClient): raise DashboardException(msg=str(e), component='rgw') return result + @staticmethod + def _handle_rules(pairs): + result = defaultdict(list) + for key, value in pairs: + if key == 'Rule': + result['Rules'].append(value) + else: + result[key] = value + return result + @RestClient.api_get('/{bucket_name}?lifecycle') def get_lifecycle(self, bucket_name, request=None): # pylint: disable=unused-argument try: - result = request() # type: ignore - result = {'LifecycleConfiguration': result} + decoded_request = request(raw_content=True).decode("utf-8") # type: ignore + result = { + 'LifecycleConfiguration': + json.loads( + decoded_request, + object_pairs_hook=RgwClient._handle_rules + ) + } except RequestException as e: if e.content: content = json_str_to_object(e.content) @@ -757,15 +775,15 @@ class RgwClient(RestClient): lifecycle = RgwClient.dict_to_xml(lifecycle) try: if lifecycle and '<LifecycleConfiguration>' not in str(lifecycle): - lifecycle = f'<LifecycleConfiguration>{lifecycle}</LifecycleConfiguration>' + lifecycle = f'<LifecycleConfiguration>\n{lifecycle}\n</LifecycleConfiguration>' result = request(data=lifecycle) # type: ignore except RequestException as e: + msg = '' if e.content: content = json_str_to_object(e.content) if content.get("Code") == "MalformedXML": msg = "Invalid Lifecycle document" - raise DashboardException(msg=msg, component='rgw') - raise DashboardException(msg=str(e), component='rgw') + raise DashboardException(msg=msg or str(e), component='rgw') return result @RestClient.api_delete('/{bucket_name}?lifecycle') @@ -1298,7 +1316,7 @@ class RgwMultisiteAutomation: user_found = False start_time = time.time() while not user_found: - if time.time() - start_time > 120: # Timeout after 2 minutes + if time.time() - start_time > 300: # Timeout after 5 minutes logger.error("Timeout reached while waiting for user %s to appear \ in the second cluster", username) raise DashboardException(code='user_replication_timeout', @@ -1981,8 +1999,16 @@ class RgwMultisite: is_multisite_configured = False return is_multisite_configured - def get_multisite_sync_status(self): + def get_multisite_sync_status(self, daemon_name: str): rgw_multisite_sync_status_cmd = ['sync', 'status'] + daemons = _get_daemons() + try: + realm_name = daemons[daemon_name].realm_name + except (KeyError, AttributeError): + raise DashboardException('Unable to get realm name from daemon', + http_status_code=500, component='rgw') + if realm_name: + rgw_multisite_sync_status_cmd.extend(['--rgw-realm', realm_name]) try: exit_code, out, _ = mgr.send_rgwadmin_command(rgw_multisite_sync_status_cmd, False) if exit_code > 0: @@ -2236,7 +2262,8 @@ class RgwMultisite: source_bucket: str = '', destination_bucket: str = '', bucket_name: str = '', - update_period=False): + update_period=False, + user: str = '', mode: str = ''): if source_zones['added'] or destination_zones['added']: rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'create', @@ -2245,11 +2272,9 @@ class RgwMultisite: if bucket_name: rgw_sync_policy_cmd += ['--bucket', 
bucket_name] - if source_bucket: - rgw_sync_policy_cmd += ['--source-bucket', source_bucket] + rgw_sync_policy_cmd += ['--source-bucket', source_bucket] - if destination_bucket: - rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket] + rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket] if source_zones['added']: rgw_sync_policy_cmd += ['--source-zones', ','.join(source_zones['added'])] @@ -2257,6 +2282,12 @@ class RgwMultisite: if destination_zones['added']: rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones['added'])] + if user: + rgw_sync_policy_cmd += ['--uid', user] + + if mode: + rgw_sync_policy_cmd += ['--mode', mode] + logger.info("Creating sync pipe!") try: exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd) @@ -2271,13 +2302,13 @@ class RgwMultisite: if ((source_zones['removed'] and '*' not in source_zones['added']) or (destination_zones['removed'] and '*' not in destination_zones['added'])): self.remove_sync_pipe(group_id, pipe_id, source_zones['removed'], - destination_zones['removed'], destination_bucket, - bucket_name) + destination_zones['removed'], + bucket_name, True) def remove_sync_pipe(self, group_id: str, pipe_id: str, source_zones: Optional[List[str]] = None, destination_zones: Optional[List[str]] = None, - destination_bucket: str = '', bucket_name: str = '', + bucket_name: str = '', update_period=False): rgw_sync_policy_cmd = ['sync', 'group', 'pipe', 'remove', '--group-id', group_id, '--pipe-id', pipe_id] @@ -2291,9 +2322,6 @@ class RgwMultisite: if destination_zones: rgw_sync_policy_cmd += ['--dest-zones', ','.join(destination_zones)] - if destination_bucket: - rgw_sync_policy_cmd += ['--dest-bucket', destination_bucket] - logger.info("Removing sync pipe! %s", rgw_sync_policy_cmd) try: exit_code, _, err = mgr.send_rgwadmin_command(rgw_sync_policy_cmd) diff --git a/src/pybind/mgr/dashboard/services/rgw_iam.py b/src/pybind/mgr/dashboard/services/rgw_iam.py new file mode 100644 index 00000000000..dbf00df25e0 --- /dev/null +++ b/src/pybind/mgr/dashboard/services/rgw_iam.py @@ -0,0 +1,24 @@ +from subprocess import SubprocessError +from typing import List + +from .. 
import mgr +from ..exceptions import DashboardException + + +class RgwAccounts: + def send_rgw_cmd(self, command: List[str]): + try: + exit_code, out, err = mgr.send_rgwadmin_command(command) + + if exit_code != 0: + raise DashboardException(msg=err, + http_status_code=500, + component='rgw') + return out + + except SubprocessError as e: + raise DashboardException(e, component='rgw') + + def get_accounts(self): + get_accounts_cmd = ['account', 'list'] + return self.send_rgw_cmd(get_accounts_cmd) diff --git a/src/pybind/mgr/dashboard/services/service.py b/src/pybind/mgr/dashboard/services/service.py index 41fcc4c4446..9b789c0c859 100644 --- a/src/pybind/mgr/dashboard/services/service.py +++ b/src/pybind/mgr/dashboard/services/service.py @@ -101,6 +101,8 @@ def wait_for_daemon_to_start(service_name, timeout=30): class RgwServiceManager: + user = 'dashboard' + def find_available_port(self, starting_port=80): orch = OrchClient.instance() daemons = [d.to_dict() for d in orch.services.list_daemons(daemon_type='rgw')] @@ -172,7 +174,6 @@ class RgwServiceManager: def configure_rgw_credentials(self): logger.info('Configuring dashboard RGW credentials') - user = 'dashboard' realms = [] access_key = '' secret_key = '' @@ -186,7 +187,7 @@ class RgwServiceManager: realm_access_keys = {} realm_secret_keys = {} for realm in realms: - realm_access_key, realm_secret_key = self._get_user_keys(user, realm) + realm_access_key, realm_secret_key = self._get_user_keys(self.user, realm) if realm_access_key: realm_access_keys[realm] = realm_access_key realm_secret_keys[realm] = realm_secret_key @@ -194,7 +195,7 @@ class RgwServiceManager: access_key = json.dumps(realm_access_keys) secret_key = json.dumps(realm_secret_keys) else: - access_key, secret_key = self._get_user_keys(user) + access_key, secret_key = self._get_user_keys(self.user) assert access_key and secret_key Settings.RGW_API_ACCESS_KEY = access_key diff --git a/src/pybind/mgr/dashboard/tests/test_osd.py b/src/pybind/mgr/dashboard/tests/test_osd.py index c3cd0dca88d..9b6dbd10de1 100644 --- a/src/pybind/mgr/dashboard/tests/test_osd.py +++ b/src/pybind/mgr/dashboard/tests/test_osd.py @@ -8,6 +8,7 @@ from ceph.deployment.drive_group import DeviceSelection, DriveGroupSpec # type: from ceph.deployment.service_spec import PlacementSpec from .. import mgr +from ..controllers._version import APIVersion from ..controllers.osd import Osd, OsdUi from ..services.osd import OsdDeploymentOptions from ..tests import ControllerTestCase @@ -274,7 +275,7 @@ class OsdTest(ControllerTestCase): osds_leftover = [0, 1, 2] with self._mock_osd_list(osd_stat_ids=osds_actual, osdmap_tree_node_ids=osds_leftover, osdmap_ids=osds_actual): - self._get('/api/osd') + self._get('/api/osd', version=APIVersion(1, 1)) self.assertEqual(len(self.json_body()), 2, 'It should display two OSDs without failure') self.assertStatus(200) diff --git a/src/pybind/mgr/dashboard/tools.py b/src/pybind/mgr/dashboard/tools.py index 51ed9c471aa..14de970cceb 100644 --- a/src/pybind/mgr/dashboard/tools.py +++ b/src/pybind/mgr/dashboard/tools.py @@ -9,9 +9,9 @@ import threading import time import urllib from datetime import datetime, timedelta -from distutils.util import strtobool import cherrypy +from ceph.utils import strtobool from mgr_util import build_url from . 
import mgr diff --git a/src/pybind/mgr/mgr_util.py b/src/pybind/mgr/mgr_util.py index 67246545eea..5d37d478de7 100644 --- a/src/pybind/mgr/mgr_util.py +++ b/src/pybind/mgr/mgr_util.py @@ -22,6 +22,7 @@ import sys from ipaddress import ip_address from threading import Lock, Condition from typing import no_type_check, NewType +from traceback import format_exc as tb_format_exc import urllib from functools import wraps if sys.version_info >= (3, 3): @@ -88,9 +89,9 @@ class RTimer(Timer): while not self.finished.is_set(): self.finished.wait(self.interval) self.function(*self.args, **self.kwargs) - self.finished.set() - except Exception as e: - logger.error("task exception: %s", e) + except Exception: + logger.error(f'exception encountered in RTimer instance "{self}":' + f'\n{tb_format_exc()}') raise diff --git a/src/pybind/mgr/mirroring/fs/snapshot_mirror.py b/src/pybind/mgr/mirroring/fs/snapshot_mirror.py index 2bfb6482674..c348ce82de1 100644 --- a/src/pybind/mgr/mirroring/fs/snapshot_mirror.py +++ b/src/pybind/mgr/mirroring/fs/snapshot_mirror.py @@ -722,6 +722,20 @@ class FSSnapshotMirror: except Exception as e: return e.args[0], '', 'failed to remove directory' + def list_dirs(self, filesystem): + try: + with self.lock: + if not self.filesystem_exist(filesystem): + raise MirrorException(-errno.ENOENT, f'filesystem {filesystem} does not exist') + fspolicy = self.pool_policy.get(filesystem, None) + if not fspolicy: + raise MirrorException(-errno.EINVAL, f'filesystem {filesystem} is not mirrored') + return 0, json.dumps(list(fspolicy.policy.dir_states.keys()), indent=4, sort_keys=True), '' + except MirrorException as me: + return me.args[0], '', me.args[1] + except Exception as e: + return e.args[0], '', 'failed to list directories' + def status(self,filesystem, dir_path): try: with self.lock: diff --git a/src/pybind/mgr/mirroring/module.py b/src/pybind/mgr/mirroring/module.py index 4b4354ab2b9..67f0942147e 100644 --- a/src/pybind/mgr/mirroring/module.py +++ b/src/pybind/mgr/mirroring/module.py @@ -84,6 +84,12 @@ class Module(MgrModule): """Remove a snapshot mirrored directory""" return self.fs_snapshot_mirror.remove_dir(fs_name, path) + @CLIWriteCommand('fs snapshot mirror ls') + def snapshot_mirror_ls(self, + fs_name: str): + """List the snapshot mirrored directories""" + return self.fs_snapshot_mirror.list_dirs(fs_name) + @CLIReadCommand('fs snapshot mirror dirmap') def snapshot_mirror_dirmap(self, fs_name: str, diff --git a/src/pybind/mgr/nfs/export.py b/src/pybind/mgr/nfs/export.py index 3ba75e60b5c..aff6779bb16 100644 --- a/src/pybind/mgr/nfs/export.py +++ b/src/pybind/mgr/nfs/export.py @@ -13,8 +13,10 @@ from typing import ( Set, cast) from os.path import normpath +from ceph.fs.earmarking import EarmarkTopScope import cephfs +from mgr_util import CephFSEarmarkResolver from rados import TimedOut, ObjectNotFound, Rados from object_format import ErrorResponse @@ -535,7 +537,8 @@ class ExportMgr: # This method is used by the dashboard module (../dashboard/controllers/nfs.py) # Do not change interface without updating the Dashboard code - def apply_export(self, cluster_id: str, export_config: str) -> AppliedExportResults: + def apply_export(self, cluster_id: str, export_config: str, + earmark_resolver: Optional[CephFSEarmarkResolver] = None) -> AppliedExportResults: try: exports = self._read_export_config(cluster_id, export_config) except Exception as e: @@ -544,7 +547,7 @@ class ExportMgr: aeresults = AppliedExportResults() for export in exports: - changed_export = 
self._change_export(cluster_id, export) + changed_export = self._change_export(cluster_id, export, earmark_resolver) # This will help figure out which export blocks in conf/json file # are problematic. if changed_export.get("state", "") == "error": @@ -573,9 +576,10 @@ class ExportMgr: return j # j is already a list object return [j] # return a single object list, with j as the only item - def _change_export(self, cluster_id: str, export: Dict) -> Dict[str, Any]: + def _change_export(self, cluster_id: str, export: Dict, + earmark_resolver: Optional[CephFSEarmarkResolver] = None) -> Dict[str, Any]: try: - return self._apply_export(cluster_id, export) + return self._apply_export(cluster_id, export, earmark_resolver) except NotImplementedError: # in theory, the NotImplementedError here may be raised by a hook back to # an orchestration module. If the orchestration module supports it the NFS @@ -651,10 +655,34 @@ class ExportMgr: log.info(f"Export user created is {json_res[0]['entity']}") return json_res[0]['key'] + def _check_earmark(self, earmark_resolver: CephFSEarmarkResolver, path: str, + fs_name: str) -> None: + earmark = earmark_resolver.get_earmark( + path, + fs_name, + ) + if not earmark: + earmark_resolver.set_earmark( + path, + fs_name, + EarmarkTopScope.NFS.value, + ) + else: + if not earmark_resolver.check_earmark( + earmark, EarmarkTopScope.NFS + ): + raise NFSException( + 'earmark has already been set by ' + earmark.split('.')[0], + -errno.EAGAIN + ) + return None + def create_export_from_dict(self, cluster_id: str, ex_id: int, - ex_dict: Dict[str, Any]) -> Export: + ex_dict: Dict[str, Any], + earmark_resolver: Optional[CephFSEarmarkResolver] = None + ) -> Export: pseudo_path = ex_dict.get("pseudo") if not pseudo_path: raise NFSInvalidOperation("export must specify pseudo path") @@ -677,6 +705,11 @@ class ExportMgr: raise FSNotFound(fs_name) validate_cephfs_path(self.mgr, fs_name, path) + + # Check if earmark is set for the path, given path is of subvolume + if earmark_resolver: + self._check_earmark(earmark_resolver, path, fs_name) + if fsal["cmount_path"] != "/": _validate_cmount_path(fsal["cmount_path"], path) # type: ignore @@ -707,7 +740,9 @@ class ExportMgr: access_type: str, clients: list = [], sectype: Optional[List[str]] = None, - cmount_path: Optional[str] = "/") -> Dict[str, Any]: + cmount_path: Optional[str] = "/", + earmark_resolver: Optional[CephFSEarmarkResolver] = None + ) -> Dict[str, Any]: validate_cephfs_path(self.mgr, fs_name, path) if cmount_path != "/": @@ -731,7 +766,8 @@ class ExportMgr: }, "clients": clients, "sectype": sectype, - } + }, + earmark_resolver ) log.debug("creating cephfs export %s", export) self._ensure_cephfs_export_user(export) @@ -795,6 +831,7 @@ class ExportMgr: self, cluster_id: str, new_export_dict: Dict, + earmark_resolver: Optional[CephFSEarmarkResolver] = None ) -> Dict[str, str]: for k in ['path', 'pseudo']: if k not in new_export_dict: @@ -834,7 +871,8 @@ class ExportMgr: new_export = self.create_export_from_dict( cluster_id, new_export_dict.get('export_id', self._gen_export_id(cluster_id)), - new_export_dict + new_export_dict, + earmark_resolver ) if not old_export: diff --git a/src/pybind/mgr/nfs/module.py b/src/pybind/mgr/nfs/module.py index be43112f396..80490ac8e7f 100644 --- a/src/pybind/mgr/nfs/module.py +++ b/src/pybind/mgr/nfs/module.py @@ -6,6 +6,7 @@ from mgr_module import MgrModule, CLICommand, Option, CLICheckNonemptyFileInput import object_format import orchestrator from orchestrator.module import IngressType +from 
mgr_util import CephFSEarmarkResolver from .export import ExportMgr, AppliedExportResults from .cluster import NFSCluster @@ -41,6 +42,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): cmount_path: Optional[str] = "/" ) -> Dict[str, Any]: """Create a CephFS export""" + earmark_resolver = CephFSEarmarkResolver(self) return self.export_mgr.create_export( fsal_type='cephfs', fs_name=fsname, @@ -51,7 +53,8 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): squash=squash, addr=client_addr, sectype=sectype, - cmount_path=cmount_path + cmount_path=cmount_path, + earmark_resolver=earmark_resolver ) @CLICommand('nfs export create rgw', perm='rw') @@ -114,8 +117,10 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): @CLICheckNonemptyFileInput(desc='Export JSON or Ganesha EXPORT specification') @object_format.Responder() def _cmd_nfs_export_apply(self, cluster_id: str, inbuf: str) -> AppliedExportResults: + earmark_resolver = CephFSEarmarkResolver(self) """Create or update an export by `-i <json_or_ganesha_export_file>`""" - return self.export_mgr.apply_export(cluster_id, export_config=inbuf) + return self.export_mgr.apply_export(cluster_id, export_config=inbuf, + earmark_resolver=earmark_resolver) @CLICommand('nfs cluster create', perm='rw') @object_format.EmptyResponder() diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 82a8c13a9c1..d5c351fda7e 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -520,6 +520,15 @@ class Orchestrator(object): """ raise NotImplementedError() + def replace_device(self, + hostname: str, + device: str, + clear: bool = False, + yes_i_really_mean_it: bool = False) -> OrchResult: + """Perform all required operations in order to replace a device. + """ + raise NotImplementedError() + def get_inventory(self, host_filter: Optional['InventoryFilter'] = None, refresh: bool = False) -> OrchResult[List['InventoryHost']]: """ Returns something that was created by `ceph-volume inventory`. @@ -576,7 +585,12 @@ class Orchestrator(object): raise NotImplementedError() @handle_orch_error - def apply(self, specs: Sequence["GenericSpec"], no_overwrite: bool = False) -> List[str]: + def apply( + self, + specs: Sequence["GenericSpec"], + no_overwrite: bool = False, + continue_on_error: bool = False + ) -> List[str]: """ Applies any spec """ @@ -699,12 +713,18 @@ class Orchestrator(object): def remove_osds(self, osd_ids: List[str], replace: bool = False, + replace_block: bool = False, + replace_db: bool = False, + replace_wal: bool = False, force: bool = False, zap: bool = False, no_destroy: bool = False) -> OrchResult[str]: """ :param osd_ids: list of OSD IDs :param replace: marks the OSD as being destroyed. See :ref:`orchestrator-osd-replace` + :param replace_block: marks the corresponding block device as being replaced. + :param replace_db: marks the corresponding db device as being replaced. + :param replace_wal: marks the corresponding wal device as being replaced. :param force: Forces the OSD removal process without waiting for the data to be drained first. :param zap: Zap/Erase all devices associated with the OSDs (DESTROYS DATA) :param no_destroy: Do not destroy associated VGs/LVs with the OSD. 
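The interface hunk above only declares replace_device(); a backend module is expected to override it. A minimal sketch of what such an override might look like follows, with the messages and internal steps purely illustrative (the cephadm backend carries the real implementation, and the CLI wrapper added in module.py below exposes it as "ceph orch device replace"):

    from orchestrator import Orchestrator, OrchResult

    class SketchOrchestrator(Orchestrator):
        def replace_device(self,
                           hostname: str,
                           device: str,
                           clear: bool = False,
                           yes_i_really_mean_it: bool = False) -> OrchResult:
            if not yes_i_really_mean_it:
                # Assumed guard: refuse destructive work without explicit consent.
                return OrchResult(f'Not replacing {device} on {hostname}; '
                                  'pass --yes-i-really-mean-it to proceed')
            # ... remove the OSD(s) backed by the device, optionally zap it when
            # clear is set, and flag the disk as being replaced so that drive
            # selection skips it until the physical swap is done ...
            return OrchResult(f'Scheduled replacement of {device} on {hostname}')
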
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index be0096bb2d9..dbfa10fb720 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -818,6 +818,21 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, return HandleCommandResult(stdout=completion.result_str()) return HandleCommandResult(stdout=completion.result_str().split('.')[0]) + @_cli_read_command('orch device replace') + def _replace_device(self, + hostname: str, + device: str, + clear: bool = False, + yes_i_really_mean_it: bool = False) -> HandleCommandResult: + """Perform all required operations in order to replace a device. + """ + completion = self.replace_device(hostname=hostname, + device=device, + clear=clear, + yes_i_really_mean_it=yes_i_really_mean_it) + raise_if_exception(completion) + return HandleCommandResult(stdout=completion.result_str()) + @_cli_read_command('orch device ls') def _list_devices(self, hostname: Optional[List[str]] = None, @@ -1415,8 +1430,9 @@ Usage: zap: bool = False, no_destroy: bool = False) -> HandleCommandResult: """Remove OSD daemons""" - completion = self.remove_osds(osd_id, replace=replace, force=force, - zap=zap, no_destroy=no_destroy) + completion = self.remove_osds(osd_id, + replace=replace, + force=force, zap=zap, no_destroy=no_destroy) raise_if_exception(completion) return HandleCommandResult(stdout=completion.result_str()) @@ -1635,12 +1651,14 @@ Usage: format: Format = Format.plain, unmanaged: bool = False, no_overwrite: bool = False, + continue_on_error: bool = False, inbuf: Optional[str] = None) -> HandleCommandResult: """Update the size or placement for a service or apply a large yaml spec""" usage = """Usage: ceph orch apply -i <yaml spec> [--dry-run] ceph orch apply <service_type> [--placement=<placement_string>] [--unmanaged] """ + errs: List[str] = [] if inbuf: if service_type or placement or unmanaged: raise OrchestratorValidationError(usage) @@ -1650,7 +1668,14 @@ Usage: # None entries in the output. Let's skip them silently. 
content = [o for o in yaml_objs if o is not None] for s in content: - spec = json_to_generic_spec(s) + try: + spec = json_to_generic_spec(s) + except Exception as e: + if continue_on_error: + errs.append(f'Failed to convert {s} from json object: {str(e)}') + continue + else: + raise e # validate the config (we need MgrModule for that) if isinstance(spec, ServiceSpec) and spec.config: @@ -1658,7 +1683,12 @@ Usage: try: self.get_foreign_ceph_option('mon', k) except KeyError: - raise SpecValidationError(f'Invalid config option {k} in spec') + err = SpecValidationError(f'Invalid config option {k} in spec') + if continue_on_error: + errs.append(str(err)) + continue + else: + raise err # There is a general "osd" service with no service id, but we use # that to dump osds created individually with "ceph orch daemon add osd" @@ -1673,7 +1703,12 @@ Usage: and spec.service_type == 'osd' and not spec.service_id ): - raise SpecValidationError('Please provide the service_id field in your OSD spec') + err = SpecValidationError('Please provide the service_id field in your OSD spec') + if continue_on_error: + errs.append(str(err)) + continue + else: + raise err if dry_run and not isinstance(spec, HostSpec): spec.preview_only = dry_run @@ -1683,15 +1718,30 @@ Usage: continue specs.append(spec) else: + # Note in this case there is only ever one spec + # being applied so there is no need to worry about + # handling of continue_on_error placementspec = PlacementSpec.from_string(placement) if not service_type: raise OrchestratorValidationError(usage) specs = [ServiceSpec(service_type.value, placement=placementspec, unmanaged=unmanaged, preview_only=dry_run)] - return self._apply_misc(specs, dry_run, format, no_overwrite) - - def _apply_misc(self, specs: Sequence[GenericSpec], dry_run: bool, format: Format, no_overwrite: bool = False) -> HandleCommandResult: - completion = self.apply(specs, no_overwrite) + cmd_result = self._apply_misc(specs, dry_run, format, no_overwrite, continue_on_error) + if errs: + # HandleCommandResult is a named tuple, so use + # _replace to modify it. 
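The _replace trick used in the next few lines is plain namedtuple behaviour: HandleCommandResult is immutable, so appending the collected errors to stdout means building a modified copy rather than mutating the tuple. A self-contained illustration, using a stand-in tuple with the same field names:

    from collections import namedtuple

    # Stand-in with the same field names as HandleCommandResult.
    Result = namedtuple('Result', ['retval', 'stdout', 'stderr'])

    res = Result(retval=0, stdout='Scheduled osd update', stderr='')
    errs = ['Invalid config option foo_bar in spec']   # hypothetical error text
    # _replace() returns a copy with the given fields swapped out.
    res = res._replace(stdout=res.stdout + '\n' + '\n'.join(errs))
    print(res.stdout)
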
+ cmd_result = cmd_result._replace(stdout=cmd_result.stdout + '\n' + '\n'.join(errs)) + return cmd_result + + def _apply_misc( + self, + specs: Sequence[GenericSpec], + dry_run: bool, + format: Format, + no_overwrite: bool = False, + continue_on_error: bool = False + ) -> HandleCommandResult: + completion = self.apply(specs, no_overwrite, continue_on_error) raise_if_exception(completion) out = completion.result_str() if dry_run: diff --git a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py index 726a7ac7937..3247b06a399 100644 --- a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py +++ b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py @@ -102,7 +102,7 @@ placement: host_pattern: '*' status: container_image_id: 74803e884bea289d2d2d3ebdf6d37cd560499e955595695b1390a89800f4e37a - container_image_name: docker.io/ceph/daemon-base:latest-master-devel + container_image_name: quay.io/ceph/daemon-base:latest-main-devel created: '2020-06-10T10:37:31.051288Z' last_refresh: '2020-06-10T10:57:40.715637Z' running: 1 diff --git a/src/pybind/mgr/smb/enums.py b/src/pybind/mgr/smb/enums.py index dea45f951f8..3e8544f43cf 100644 --- a/src/pybind/mgr/smb/enums.py +++ b/src/pybind/mgr/smb/enums.py @@ -21,7 +21,7 @@ class CephFSStorageProvider(_StrEnum): def expand(self) -> 'CephFSStorageProvider': """Expand abbreviated/default values into the full/expanded form.""" - if self == self.SAMBA_VFS: + if self is self.SAMBA_VFS: # mypy gets confused by enums return self.__class__(self.SAMBA_VFS_NEW) return self @@ -89,9 +89,9 @@ class LoginAccess(_StrEnum): def expand(self) -> 'LoginAccess': """Exapend abbreviated enum values into their full forms.""" # the extra LoginAccess(...) calls are to appease mypy - if self == self.READ_ONLY_SHORT: + if self is self.READ_ONLY_SHORT: return LoginAccess(self.READ_ONLY) - if self == self.READ_WRITE_SHORT: + if self is self.READ_WRITE_SHORT: return LoginAccess(self.READ_WRITE) return self diff --git a/src/pybind/mgr/smb/handler.py b/src/pybind/mgr/smb/handler.py index 670cb15a587..7b993d5b60d 100644 --- a/src/pybind/mgr/smb/handler.py +++ b/src/pybind/mgr/smb/handler.py @@ -834,6 +834,19 @@ def _check_cluster(cluster: ClusterRef, staging: _Staging) -> None: ) +def _parse_earmark(earmark: str) -> dict: + parts = earmark.split('.') + + # If it only has one part (e.g., 'smb'), return None for cluster_id + if len(parts) == 1: + return {'scope': parts[0], 'cluster_id': None} + + return { + 'scope': parts[0], + 'cluster_id': parts[2] if len(parts) > 2 else None, + } + + def _check_share( share: ShareRef, staging: _Staging, @@ -878,19 +891,28 @@ def _check_share( smb_earmark, ) else: + parsed_earmark = _parse_earmark(earmark) + + # Check if the top-level scope is not SMB if not earmark_resolver.check_earmark( earmark, EarmarkTopScope.SMB ): raise ErrorResult( share, - msg=f"earmark has already been set by {earmark.split('.')[0]}", + msg=f"earmark has already been set by {parsed_earmark['scope']}", ) - # Check if earmark is set by same cluster - if earmark.split('.')[2] != share.cluster_id: + + # Check if the earmark is set by a different cluster + if ( + parsed_earmark['cluster_id'] + and parsed_earmark['cluster_id'] != share.cluster_id + ): raise ErrorResult( share, - msg=f"earmark has already been set by smb cluster {earmark.split('.')[2]}", + msg="earmark has already been set by smb cluster " + f"{parsed_earmark['cluster_id']}", ) + name_used_by = _share_name_in_use(staging, share) if name_used_by: raise 
ErrorResult( diff --git a/src/pybind/mgr/smb/module.py b/src/pybind/mgr/smb/module.py index 7483eb7964b..4512ad6add3 100644 --- a/src/pybind/mgr/smb/module.py +++ b/src/pybind/mgr/smb/module.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +from typing import TYPE_CHECKING, Any, List, Optional, cast import logging @@ -171,6 +171,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): custom_dns: Optional[List[str]] = None, placement: Optional[str] = None, clustering: Optional[SMBClustering] = None, + public_addrs: Optional[List[str]] = None, ) -> results.Result: """Create an smb cluster""" domain_settings = None @@ -255,6 +256,18 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): ) ) + c_public_addrs = [] + if public_addrs: + for pa in public_addrs: + pa_arr = pa.split('%', 1) + address = pa_arr[0] + destination = pa_arr[1] if len(pa_arr) > 1 else None + c_public_addrs.append( + resources.ClusterPublicIPAssignment( + address=address, destination=destination + ) + ) + pspec = resources.WrappedPlacementSpec.wrap( PlacementSpec.from_string(placement) ) @@ -266,6 +279,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): custom_dns=custom_dns, placement=pspec, clustering=clustering, + public_addrs=c_public_addrs, ) to_apply.append(cluster) return self._handler.apply(to_apply, create_only=True).squash(cluster) @@ -336,45 +350,6 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): return resources[0].to_simplified() return {'resources': [r.to_simplified() for r in resources]} - @cli.SMBCommand('dump cluster-config', perm='r') - def dump_config(self, cluster_id: str) -> Dict[str, Any]: - """DEBUG: Generate an example configuration""" - # TODO: Remove this command prior to release - return self._handler.generate_config(cluster_id) - - @cli.SMBCommand('dump service-spec', perm='r') - def dump_service_spec(self, cluster_id: str) -> Dict[str, Any]: - """DEBUG: Generate an example smb service spec""" - # TODO: Remove this command prior to release - return dict( - self._handler.generate_smb_service_spec(cluster_id).to_json() - ) - - @cli.SMBCommand('dump everything', perm='r') - def dump_everything(self) -> Dict[str, Any]: - """DEBUG: Show me everything""" - # TODO: Remove this command prior to release - everything: Dict[str, Any] = {} - everything['PUBLIC'] = {} - log.warning('dumping PUBLIC') - for key in self._public_store: - e = self._public_store[key] - log.warning('dumping e: %s %r', e.uri, e.full_key) - everything['PUBLIC'][e.uri] = e.get() - log.warning('dumping PRIV') - everything['PRIV'] = {} - for key in self._priv_store: - e = self._priv_store[key] - log.warning('dumping e: %s %r', e.uri, e.full_key) - everything['PRIV'][e.uri] = e.get() - log.warning('dumping INTERNAL') - everything['INTERNAL'] = {} - for key in self._internal_store: - e = self._internal_store[key] - log.warning('dumping e: %s %r', e.uri, e.full_key) - everything['INTERNAL'][e.uri] = e.get() - return everything - def submit_smb_spec(self, spec: SMBSpec) -> None: """Submit a new or updated smb spec object to ceph orchestration.""" completion = self.apply_smb(spec) diff --git a/src/pybind/mgr/smb/tests/test_smb.py b/src/pybind/mgr/smb/tests/test_smb.py index c9fd02968b9..0d3610326c2 100644 --- a/src/pybind/mgr/smb/tests/test_smb.py +++ b/src/pybind/mgr/smb/tests/test_smb.py @@ -410,72 +410,6 @@ def test_cmd_apply_share(tmodule): assert bdata["results"][0]["state"] == "created" -def test_share_dump_config(tmodule): - 
_example_cfg_1(tmodule) - - cfg = tmodule.dump_config('foo') - assert cfg == { - 'samba-container-config': "v0", - 'configs': { - 'foo': { - 'instance_name': 'foo', - 'instance_features': [], - 'shares': ['Ess One', 'Ess Two'], - 'globals': ['default', 'foo'], - }, - }, - 'shares': { - 'Ess One': { - 'options': { - 'path': '/', - 'read only': 'No', - 'browseable': 'Yes', - 'kernel share modes': 'no', - 'x:ceph:id': 'foo.s1', - 'vfs objects': 'acl_xattr ceph_new', - 'acl_xattr:security_acl_name': 'user.NTACL', - 'ceph_new:config_file': '/etc/ceph/ceph.conf', - 'ceph_new:filesystem': 'cephfs', - 'ceph_new:user_id': 'smb.fs.cluster.foo', - }, - }, - 'Ess Two': { - 'options': { - 'path': '/two', - 'read only': 'No', - 'browseable': 'Yes', - 'kernel share modes': 'no', - 'x:ceph:id': 'foo.stwo', - 'vfs objects': 'acl_xattr ceph_new', - 'acl_xattr:security_acl_name': 'user.NTACL', - 'ceph_new:config_file': '/etc/ceph/ceph.conf', - 'ceph_new:filesystem': 'cephfs', - 'ceph_new:user_id': 'smb.fs.cluster.foo', - }, - }, - }, - 'globals': { - 'default': { - 'options': { - 'load printers': 'No', - 'printing': 'bsd', - 'printcap name': '/dev/null', - 'disable spoolss': 'Yes', - }, - }, - 'foo': { - 'options': { - 'idmap config * : backend': 'autorid', - 'idmap config * : range': '2000-9999999', - 'realm': 'dom1.example.com', - 'security': 'ads', - 'workgroup': 'DOM1', - }, - }, - }, - } - - def test_cluster_create_ad1(tmodule): _example_cfg_1(tmodule) @@ -613,29 +547,6 @@ def test_cluster_rm(tmodule): assert result.success -def test_dump_service_spec(tmodule): - _example_cfg_1(tmodule) - tmodule._public_store.overwrite( - { - 'foo.config.smb': '', - } - ) - tmodule._priv_store.overwrite( - { - 'foo.join.2b9902c05d08bcba.json': '', - 'foo.join.08129d4d3b8c37c7.json': '', - } - ) - - cfg = tmodule.dump_service_spec('foo') - assert cfg - assert cfg['service_id'] == 'foo' - assert cfg['spec']['cluster_id'] == 'foo' - assert cfg['spec']['features'] == ['domain'] - assert cfg['spec']['config_uri'] == 'mem:foo/config.smb' - assert len(cfg['spec']['join_sources']) == 2 - - def test_cmd_show_resource_json(tmodule): _example_cfg_1(tmodule) diff --git a/src/pybind/mgr/status/module.py b/src/pybind/mgr/status/module.py index 85e65266a55..2b59132c1cb 100644 --- a/src/pybind/mgr/status/module.py +++ b/src/pybind/mgr/status/module.py @@ -161,7 +161,7 @@ class Module(MgrModule): if output_format in ('json', 'json-pretty'): json_output['mdsmap'].append({ - 'rank': rank, + 'rank': f"{daemon_info['rank']}-s", 'name': daemon_info['name'], 'state': 'standby-replay', 'events': events, diff --git a/src/pybind/mgr/telemetry/tox.ini b/src/pybind/mgr/telemetry/tox.ini index a887590eed8..b2210da54ea 100644 --- a/src/pybind/mgr/telemetry/tox.ini +++ b/src/pybind/mgr/telemetry/tox.ini @@ -1,7 +1,6 @@ [tox] envlist = py3 - mypy skipsdist = true [testenv] diff --git a/src/pybind/mgr/tox.ini b/src/pybind/mgr/tox.ini index a8a2d39d01a..f39ececa93d 100644 --- a/src/pybind/mgr/tox.ini +++ b/src/pybind/mgr/tox.ini @@ -160,7 +160,8 @@ modules = commands = flake8 --config=tox.ini {posargs} \ {posargs:{[testenv:flake8]modules}} - bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 13' + bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "docker.io" | wc -l) == 3' + bash -c 'test $(git ls-files cephadm | grep ".py$" | grep -v tests | xargs grep "quay.io" | wc -l) == 26' [testenv:jinjalint] deps = diff --git a/src/pybind/mgr/volumes/fs/async_cloner.py 
b/src/pybind/mgr/volumes/fs/async_cloner.py index 463c1000596..1525f57c3f8 100644 --- a/src/pybind/mgr/volumes/fs/async_cloner.py +++ b/src/pybind/mgr/volumes/fs/async_cloner.py @@ -313,6 +313,8 @@ class Cloner(AsyncJobs): the driver. file types supported are directories, symbolic links and regular files. """ def __init__(self, volume_client, tp_size, snapshot_clone_delay, clone_no_wait): + super(Cloner, self).__init__(volume_client, "cloner", tp_size) + self.vc = volume_client self.snapshot_clone_delay = snapshot_clone_delay self.snapshot_clone_no_wait = clone_no_wait @@ -323,7 +325,6 @@ class Cloner(AsyncJobs): SubvolumeStates.STATE_FAILED : handle_clone_failed, SubvolumeStates.STATE_CANCELED : handle_clone_failed, } - super(Cloner, self).__init__(volume_client, "cloner", tp_size) def reconfigure_max_concurrent_clones(self, tp_size): return super(Cloner, self).reconfigure_max_async_threads(tp_size) diff --git a/src/pybind/mgr/volumes/fs/async_job.py b/src/pybind/mgr/volumes/fs/async_job.py index 6834e3e240b..075fedf20a4 100644 --- a/src/pybind/mgr/volumes/fs/async_job.py +++ b/src/pybind/mgr/volumes/fs/async_job.py @@ -19,11 +19,12 @@ class JobThread(threading.Thread): MAX_RETRIES_ON_EXCEPTION = 10 def __init__(self, async_job, volume_client, name): + threading.Thread.__init__(self, name=name) + self.vc = volume_client self.async_job = async_job # event object to cancel jobs self.cancel_event = threading.Event() - threading.Thread.__init__(self, name=name) def run(self): retries = 0 @@ -117,16 +118,21 @@ class AsyncJobs(threading.Thread): def __init__(self, volume_client, name_pfx, nr_concurrent_jobs): threading.Thread.__init__(self, name="{0}.tick".format(name_pfx)) + self.vc = volume_client - # queue of volumes for starting async jobs + # self.q is a deque of names of a volumes for which async jobs needs + # to be started. self.q = deque() # type: deque - # volume => job tracking + + # self.jobs is a dictionary where volume name is the key and value is + # a tuple containing two members: the async job and an instance of + # threading.Thread that performs that job. + # in short, self.jobs = {volname: (async_job, thread instance)}. 
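The two attributes documented in the comments above have a simple shape; a toy illustration, where the volume names and job payloads are hypothetical:

    from collections import deque
    import threading

    # Volumes still waiting for an async job to be picked up.
    q = deque(['vol1', 'vol2'])

    # Per-volume tracking: volname -> (current job, thread executing that job).
    jobs = {
        'vol0': (('purge', '/volumes/_deleting/sub0'),
                 threading.Thread(name='purgejob.0', target=lambda: None)),
    }
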
self.jobs = {} + # lock, cv for kickstarting jobs self.lock = threading.Lock() self.cv = threading.Condition(self.lock) - # cv for job cancelation - self.waiting = False self.stopping = threading.Event() self.cancel_cv = threading.Condition(self.lock) self.nr_concurrent_jobs = nr_concurrent_jobs @@ -136,11 +142,31 @@ class AsyncJobs(threading.Thread): self.wakeup_timeout = None self.threads = [] - for i in range(self.nr_concurrent_jobs): - self.threads.append(JobThread(self, volume_client, name="{0}.{1}".format(self.name_pfx, i))) - self.threads[-1].start() + self.spawn_all_threads() self.start() + def spawn_new_thread(self, suffix): + t_name = f'{self.name_pfx}.{time.time()}.{suffix}' + log.debug(f'spawning new thread with name {t_name}') + t = JobThread(self, self.vc, name=t_name) + t.start() + + self.threads.append(t) + + def spawn_all_threads(self): + log.debug(f'spawning {self.nr_concurrent_jobs} to execute more jobs ' + 'concurrently') + for i in range(self.nr_concurrent_jobs): + self.spawn_new_thread(i) + + def spawn_more_threads(self): + c = len(self.threads) + diff = self.nr_concurrent_jobs - c + log.debug(f'spawning {diff} threads to execute more jobs concurrently') + + for i in range(c, self.nr_concurrent_jobs): + self.spawn_new_thread(i) + def set_wakeup_timeout(self): with self.lock: # not made configurable on purpose @@ -163,11 +189,8 @@ class AsyncJobs(threading.Thread): self.cv.notifyAll() elif c < self.nr_concurrent_jobs: # Increase concurrency: create more threads. - log.debug("creating new threads to job increase") - for i in range(c, self.nr_concurrent_jobs): - self.threads.append(JobThread(self, self.vc, name="{0}.{1}.{2}".format(self.name_pfx, time.time(), i))) - self.threads[-1].start() - self.cv.wait(timeout=5) + self.spawn_more_threads() + self.cv.wait(timeout=self.wakeup_timeout) def shutdown(self): self.stopping.set() diff --git a/src/pybind/mgr/volumes/fs/operations/pin_util.py b/src/pybind/mgr/volumes/fs/operations/pin_util.py index a12ab5b4d4b..631fdd8fcaa 100644 --- a/src/pybind/mgr/volumes/fs/operations/pin_util.py +++ b/src/pybind/mgr/volumes/fs/operations/pin_util.py @@ -3,7 +3,7 @@ import errno import cephfs from ..exception import VolumeException -from distutils.util import strtobool +from ceph.utils import strtobool _pin_value = { "export": lambda x: int(x), diff --git a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py index 610a61e6a4c..146d6d3f453 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py @@ -172,7 +172,7 @@ class MetadataManager(object): metadata_dict[option] = self.config.get(section,option) return metadata_dict - def list_all_keys_with_specified_values_from_section(self, section, value): + def filter_keys(self, section, value): keys = [] if self.config.has_section(section): options = self.config.options(section) diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py index 33d364b8b45..72209ca61b5 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py @@ -758,7 +758,7 @@ class SubvolumeV1(SubvolumeBase, SubvolumeTemplate): try: if self.has_pending_clones(snapname): - pending_track_id_list = self.metadata_mgr.list_all_keys_with_specified_values_from_section('clone snaps', snapname) + 
pending_track_id_list = self.metadata_mgr.filter_keys('clone snaps', snapname) else: return pending_clones_info except MetadataMgrException as me: @@ -780,9 +780,9 @@ class SubvolumeV1(SubvolumeBase, SubvolumeTemplate): raise VolumeException(-e.args[0], e.args[1]) else: try: - # If clone is completed between 'list_all_keys_with_specified_values_from_section' - # and readlink(track_id_path) call then readlink will fail with error ENOENT (2) - # Hence we double check whether track_id is exist in .meta file or not. + # If clone is completed between 'filter_keys' and readlink(track_id_path) call + # then readlink will fail with error ENOENT (2). Hence we double check whether + # track_id exists in .meta file or not. # Edge case scenario. # If track_id for clone exist but path /volumes/_index/clone/{track_id} not found # then clone is orphan. diff --git a/src/pybind/mgr/volumes/fs/purge_queue.py b/src/pybind/mgr/volumes/fs/purge_queue.py index abace19d029..8917b475ac6 100644 --- a/src/pybind/mgr/volumes/fs/purge_queue.py +++ b/src/pybind/mgr/volumes/fs/purge_queue.py @@ -103,9 +103,10 @@ class ThreadPoolPurgeQueueMixin(AsyncJobs): _all_ threads purging entries for one volume (starving other volumes). """ def __init__(self, volume_client, tp_size): - self.vc = volume_client super(ThreadPoolPurgeQueueMixin, self).__init__(volume_client, "purgejob", tp_size) + self.vc = volume_client + def get_next_job(self, volname, running_jobs): return get_trash_entry_for_volume(self.fs_client, self.vc.volspec, volname, running_jobs) diff --git a/src/pybind/mgr/volumes/fs/stats_util.py b/src/pybind/mgr/volumes/fs/stats_util.py index cec33eaa887..3334dc5a3d7 100644 --- a/src/pybind/mgr/volumes/fs/stats_util.py +++ b/src/pybind/mgr/volumes/fs/stats_util.py @@ -106,6 +106,11 @@ class CloneProgressReporter: # reporting has already been initiated by calling RTimer.is_alive(). 
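The filter_keys() rename above (formerly list_all_keys_with_specified_values_from_section) keeps the same behaviour: return the option names in a config section whose stored value equals the one given, e.g. the track IDs of clones still pending on a snapshot. A self-contained sketch of that behaviour using the standard configparser module (the real MetadataManager wraps its own config object, so the names here are illustrative):

import configparser

def filter_keys(config, section, value):
    # collect option names in `section` whose value equals `value`
    keys = []
    if config.has_section(section):
        for option in config.options(section):
            if config.get(section, option) == value:
                keys.append(option)
    return keys

# usage: find track IDs of clones pending on snapshot 'snap-a'
cfg = configparser.ConfigParser()
cfg.read_dict({'clone snaps': {'track-1': 'snap-a', 'track-2': 'snap-b'}})
assert filter_keys(cfg, 'clone snaps', 'snap-a') == ['track-1']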
self.update_task = RTimer(1, self._update_progress_bars) + # progress event ID for ongoing clone jobs + self.on_pev_id: Optional[str] = 'mgr-vol-ongoing-clones' + # progress event ID for ongoing+pending clone jobs + self.onpen_pev_id: Optional[str] = 'mgr-vol-total-clones' + def initiate_reporting(self): if self.update_task.is_alive(): log.info('progress reporting thread is already alive, not ' @@ -113,11 +118,6 @@ class CloneProgressReporter: return log.info('initiating progress reporting for clones...') - # progress event ID for ongoing clone jobs - self.on_pev_id: Optional[str] = 'mgr-vol-ongoing-clones' - # progress event ID for ongoing+pending clone jobs - self.onpen_pev_id: Optional[str] = 'mgr-vol-total-clones' - self.update_task = RTimer(1, self._update_progress_bars) self.update_task.start() log.info('progress reporting for clones has been initiated') @@ -294,10 +294,7 @@ class CloneProgressReporter: assert self.onpen_pev_id is not None self.volclient.mgr.remote('progress', 'complete', self.on_pev_id) - self.on_pev_id = None - self.volclient.mgr.remote('progress', 'complete', self.onpen_pev_id) - self.onpen_pev_id = None log.info('finished removing progress bars from "ceph status" output') diff --git a/src/python-common/ceph/deployment/drive_selection/selector.py b/src/python-common/ceph/deployment/drive_selection/selector.py index 041f1ed3044..59ebbb6347e 100644 --- a/src/python-common/ceph/deployment/drive_selection/selector.py +++ b/src/python-common/ceph/deployment/drive_selection/selector.py @@ -131,6 +131,10 @@ class DriveSelection(object): for disk in self.disks: logger.debug("Processing disk {}".format(disk.path)) + if disk.being_replaced: + logger.debug('Ignoring disk {} as it is being replaced.'.format(disk.path)) + continue + if not disk.available and not disk.ceph_device: logger.debug( ("Ignoring disk {}. 
" diff --git a/src/python-common/ceph/deployment/inventory.py b/src/python-common/ceph/deployment/inventory.py index a3023882108..e2c1a5605f9 100644 --- a/src/python-common/ceph/deployment/inventory.py +++ b/src/python-common/ceph/deployment/inventory.py @@ -54,7 +54,8 @@ class Device(object): 'human_readable_type', 'device_id', 'lsm_data', - 'crush_device_class' + 'crush_device_class', + 'being_replaced' ] def __init__(self, @@ -67,7 +68,8 @@ class Device(object): lsm_data=None, # type: Optional[Dict[str, Dict[str, str]]] created=None, # type: Optional[datetime.datetime] ceph_device=None, # type: Optional[bool] - crush_device_class=None # type: Optional[str] + crush_device_class=None, # type: Optional[str] + being_replaced=None, # type: Optional[bool] ): self.path = path @@ -80,6 +82,7 @@ class Device(object): self.created = created if created is not None else datetime_now() self.ceph_device = ceph_device self.crush_device_class = crush_device_class + self.being_replaced = being_replaced def __eq__(self, other): # type: (Any) -> bool @@ -129,7 +132,8 @@ class Device(object): 'lvs': self.lvs if self.lvs else 'None', 'available': str(self.available), 'ceph_device': str(self.ceph_device), - 'crush_device_class': str(self.crush_device_class) + 'crush_device_class': str(self.crush_device_class), + 'being_replaced': str(self.being_replaced) } if not self.available and self.rejected_reasons: device_desc['rejection reasons'] = self.rejected_reasons diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index d1ef9f5ac95..979c14f7d00 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -1340,6 +1340,8 @@ class NvmeofServiceSpec(ServiceSpec): allowed_consecutive_spdk_ping_failures: Optional[int] = 1, spdk_ping_interval_in_seconds: Optional[float] = 2.0, ping_spdk_under_lock: Optional[bool] = False, + max_hosts_per_namespace: Optional[int] = 1, + max_namespaces_with_netmask: Optional[int] = 1000, server_key: Optional[str] = None, server_cert: Optional[str] = None, client_key: Optional[str] = None, @@ -1348,7 +1350,9 @@ class NvmeofServiceSpec(ServiceSpec): spdk_path: Optional[str] = None, tgt_path: Optional[str] = None, spdk_timeout: Optional[float] = 60.0, - spdk_log_level: Optional[str] = 'WARNING', + spdk_log_level: Optional[str] = '', + spdk_protocol_log_level: Optional[str] = 'WARNING', + spdk_log_file_dir: Optional[str] = '', rpc_socket_dir: Optional[str] = '/var/tmp/', rpc_socket_name: Optional[str] = 'spdk.sock', conn_retries: Optional[int] = 10, @@ -1368,6 +1372,7 @@ class NvmeofServiceSpec(ServiceSpec): log_directory: Optional[str] = '/var/log/ceph/', monitor_timeout: Optional[float] = 1.0, enable_monitor_client: bool = True, + monitor_client_log_file_dir: Optional[str] = '', placement: Optional[PlacementSpec] = None, unmanaged: bool = False, preview_only: bool = False, @@ -1416,6 +1421,10 @@ class NvmeofServiceSpec(ServiceSpec): self.omap_file_lock_retry_sleep_interval = omap_file_lock_retry_sleep_interval #: ``omap_file_update_reloads`` number of attempt to reload OMAP when it differs from local self.omap_file_update_reloads = omap_file_update_reloads + #: ``max_hosts_per_namespace`` max number of hosts per namespace + self.max_hosts_per_namespace = max_hosts_per_namespace + #: ``max_namespaces_with_netmask`` max number of namespaces which are not auto visible + self.max_namespaces_with_netmask = max_namespaces_with_netmask #: 
``allowed_consecutive_spdk_ping_failures`` # of ping failures before aborting gateway self.allowed_consecutive_spdk_ping_failures = allowed_consecutive_spdk_ping_failures #: ``spdk_ping_interval_in_seconds`` sleep interval in seconds between SPDK pings @@ -1441,7 +1450,11 @@ class NvmeofServiceSpec(ServiceSpec): #: ``spdk_timeout`` SPDK connectivity timeout self.spdk_timeout = spdk_timeout #: ``spdk_log_level`` the SPDK log level - self.spdk_log_level = spdk_log_level or 'WARNING' + self.spdk_log_level = spdk_log_level + #: ``spdk_protocol_log_level`` the SPDK protocol log level + self.spdk_protocol_log_level = spdk_protocol_log_level or 'WARNING' + #: ``spdk_log_file_dir`` the SPDK log output file file directory + self.spdk_log_file_dir = spdk_log_file_dir #: ``rpc_socket_dir`` the SPDK RPC socket file directory self.rpc_socket_dir = rpc_socket_dir or '/var/tmp/' #: ``rpc_socket_name`` the SPDK RPC socket file name @@ -1478,6 +1491,8 @@ class NvmeofServiceSpec(ServiceSpec): self.monitor_timeout = monitor_timeout #: ``enable_monitor_client`` whether to connect to the ceph monitor or not self.enable_monitor_client = enable_monitor_client + #: ``monitor_client_log_file_dir`` the monitor client log output file file directory + self.monitor_client_log_file_dir = monitor_client_log_file_dir def get_port_start(self) -> List[int]: return [5500, 4420, 8009] @@ -1522,6 +1537,16 @@ class NvmeofServiceSpec(ServiceSpec): 'Invalid SPDK log level. Valid values are: ' 'DEBUG, INFO, WARNING, ERROR, NOTICE') + if self.spdk_protocol_log_level: + if self.spdk_protocol_log_level.lower() not in ['debug', + 'info', + 'warning', + 'error', + 'notice']: + raise SpecValidationError( + 'Invalid SPDK protocol log level. Valid values are: ' + 'DEBUG, INFO, WARNING, ERROR, NOTICE') + if ( self.spdk_ping_interval_in_seconds and self.spdk_ping_interval_in_seconds < 1.0 @@ -1762,7 +1787,7 @@ class IngressSpec(ServiceSpec): if not self.keepalive_only and not self.frontend_port: raise SpecValidationError( 'Cannot add ingress: No frontend_port specified') - if not self.monitor_port: + if not self.keepalive_only and not self.monitor_port: raise SpecValidationError( 'Cannot add ingress: No monitor_port specified') if not self.virtual_ip and not self.virtual_ips_list: @@ -1804,6 +1829,8 @@ class MgmtGatewaySpec(ServiceSpec): ssl_stapling_verify: Optional[str] = None, ssl_protocols: Optional[List[str]] = None, ssl_ciphers: Optional[List[str]] = None, + enable_health_check_endpoint: bool = False, + virtual_ip: Optional[str] = None, preview_only: bool = False, unmanaged: bool = False, extra_container_args: Optional[GeneralArgList] = None, @@ -1849,6 +1876,8 @@ class MgmtGatewaySpec(ServiceSpec): self.ssl_protocols = ssl_protocols #: List of supported secure SSL ciphers. Changing this list may reduce system security. 
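The new spdk_protocol_log_level option above is validated the same way as spdk_log_level: a case-insensitive membership check that raises a spec-validation error for unknown levels, while an empty value keeps the daemon default. A minimal sketch of that validation pattern (the SpecValidationError class and the function name below are stand-ins, not the cephadm code):

class SpecValidationError(Exception):
    pass

_ALLOWED_LEVELS = {'debug', 'info', 'warning', 'error', 'notice'}

def validate_log_level(level):
    # empty/None means: leave the daemon's default log level untouched
    if not level:
        return
    if level.lower() not in _ALLOWED_LEVELS:
        raise SpecValidationError(
            'Invalid SPDK protocol log level. Valid values are: '
            'DEBUG, INFO, WARNING, ERROR, NOTICE')

validate_log_level('WARNING')   # accepted
validate_log_level('')          # accepted, keeps default
try:
    validate_log_level('verbose')
except SpecValidationError as e:
    print(e)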
self.ssl_ciphers = ssl_ciphers + self.enable_health_check_endpoint = enable_health_check_endpoint + self.virtual_ip = virtual_ip def get_port_start(self) -> List[int]: ports = [] diff --git a/src/python-common/ceph/fs/earmarking.py b/src/python-common/ceph/fs/earmarking.py index 238f2d8755f..c5d4a59a4d5 100644 --- a/src/python-common/ceph/fs/earmarking.py +++ b/src/python-common/ceph/fs/earmarking.py @@ -61,11 +61,16 @@ class CephFSVolumeEarmarking: if isinstance(e, ValueError): raise EarmarkException(errno.EINVAL, f"Invalid earmark specified: {e}") from e elif isinstance(e, OSError): - log.error(f"Error {action} earmark: {e}") - raise EarmarkException(-e.errno, e.strerror) from e + if e.errno == errno.ENODATA: + # Return empty string when earmark is not set + log.info(f"No earmark set for the path while {action}. Returning empty result.") + return '' + else: + log.error(f"Error {action} earmark: {e}") + raise EarmarkException(-e.errno, e.strerror) from e else: log.error(f"Unexpected error {action} earmark: {e}") - raise EarmarkException + raise EarmarkException(errno.EFAULT, f"Unexpected error {action} earmark: {e}") from e @staticmethod def parse_earmark(value: str) -> Optional[EarmarkContents]: @@ -128,8 +133,7 @@ class CephFSVolumeEarmarking: ) return earmark_value except Exception as e: - self._handle_cephfs_error(e, "getting") - return None + return self._handle_cephfs_error(e, "getting") def set_earmark(self, earmark: str): # Validate the earmark before attempting to set it diff --git a/src/python-common/ceph/utils.py b/src/python-common/ceph/utils.py index e92a2d1de7d..0544e9f4173 100644 --- a/src/python-common/ceph/utils.py +++ b/src/python-common/ceph/utils.py @@ -167,3 +167,18 @@ def http_req(hostname: str = '', log.error(e) # handle error here if needed raise + + +_TRUE_VALS = {'y', 'yes', 't', 'true', 'on', '1'} +_FALSE_VALS = {'n', 'no', 'f', 'false', 'off', '0'} + + +def strtobool(value: str) -> bool: + """Convert a string to a boolean value. + Based on a similar function once available at distutils.util.strtobool.
+ """ + if value.lower() in _TRUE_VALS: + return True + if value.lower() in _FALSE_VALS: + return False + raise ValueError(f'invalid truth value {value!r}') diff --git a/src/rgw/driver/daos/rgw_sal_daos.cc b/src/rgw/driver/daos/rgw_sal_daos.cc index cf6820a9111..a87d88c4b85 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.cc +++ b/src/rgw/driver/daos/rgw_sal_daos.cc @@ -1028,6 +1028,22 @@ int DaosObject::transition_to_cloud( return DAOS_NOT_IMPLEMENTED_LOG(dpp); } +int DaosObject::restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) +{ + return DAOS_NOT_IMPLEMENTED_LOG(dpp); +} + bool DaosObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) { /* XXX: support single default zone and zonegroup for now */ diff --git a/src/rgw/driver/daos/rgw_sal_daos.h b/src/rgw/driver/daos/rgw_sal_daos.h index 7cc20260227..e382fdb04ae 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.h +++ b/src/rgw/driver/daos/rgw_sal_daos.h @@ -649,6 +649,18 @@ class DaosObject : public StoreObject { CephContext* cct, bool update_object, const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) override; virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override; virtual int dump_obj_layout(const DoutPrefixProvider* dpp, optional_yield y, diff --git a/src/rgw/driver/posix/rgw_sal_posix.cc b/src/rgw/driver/posix/rgw_sal_posix.cc index 0ce02bcff13..1345468210f 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.cc +++ b/src/rgw/driver/posix/rgw_sal_posix.cc @@ -3039,6 +3039,22 @@ int POSIXObject::transition_to_cloud(Bucket* bucket, return -ERR_NOT_IMPLEMENTED; } +int POSIXObject::restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) +{ + return -ERR_NOT_IMPLEMENTED; +} + bool POSIXObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) { return (r1 == r2); diff --git a/src/rgw/driver/posix/rgw_sal_posix.h b/src/rgw/driver/posix/rgw_sal_posix.h index efe3bfd7a50..8ec72bbc1bc 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.h +++ b/src/rgw/driver/posix/rgw_sal_posix.h @@ -681,6 +681,18 @@ public: bool update_object, const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) override; virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override; virtual int dump_obj_layout(const DoutPrefixProvider *dpp, 
optional_yield y, Formatter* f) override; virtual int swift_versioning_restore(const ACLOwner& owner, const rgw_user& remote_user, bool& restored, diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc index 792671579b7..d5437f548c1 100644 --- a/src/rgw/driver/rados/rgw_data_sync.cc +++ b/src/rgw/driver/rados/rgw_data_sync.cc @@ -6052,12 +6052,13 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp) } else { tn->log(20, SSTR("logged prev gen entry (bucket=" << source_bs.bucket << ", shard_id=" << source_bs.shard_id << ", gen=" << current_gen << " in error repo: retcode=" << retcode)); } - } + } else { retcode = -EAGAIN; tn->log(10, SSTR("ERROR: requested sync of future generation " << *gen << " > " << current_gen << ", returning " << retcode << " for later retry")); return set_cr_error(retcode); + } } else if (*gen < current_gen) { tn->log(10, SSTR("WARNING: requested sync of past generation " << *gen << " < " << current_gen diff --git a/src/rgw/driver/rados/rgw_datalog.h b/src/rgw/driver/rados/rgw_datalog.h index 58042df2c62..6cfaee9dc82 100644 --- a/src/rgw/driver/rados/rgw_datalog.h +++ b/src/rgw/driver/rados/rgw_datalog.h @@ -241,10 +241,7 @@ class RGWDataChangesLog { std::unique_ptr<DataLogBackends> bes; const int num_shards; - std::string get_prefix() { - auto prefix = cct->_conf->rgw_data_log_obj_prefix; - return prefix.empty() ? prefix : "data_log"; - } + std::string get_prefix() { return "data_log"; } std::string metadata_log_oid() { return get_prefix() + "generations_metadata"; } diff --git a/src/rgw/driver/rados/rgw_lc_tier.cc b/src/rgw/driver/rados/rgw_lc_tier.cc index 64c55700eb2..b153a7b4a42 100644 --- a/src/rgw/driver/rados/rgw_lc_tier.cc +++ b/src/rgw/driver/rados/rgw_lc_tier.cc @@ -14,6 +14,7 @@ #include "rgw_common.h" #include "rgw_rest.h" #include "svc_zone.h" +#include "rgw_rados.h" #include <boost/algorithm/string/split.hpp> #include <boost/algorithm/string.hpp> @@ -76,8 +77,9 @@ WRITE_CLASS_ENCODER(rgw_lc_multipart_upload_info) static inline string get_key_instance(const rgw_obj_key& key) { - if (!key.instance.empty() && - !key.have_null_instance()) { + // if non-current entry, add versionID to the + // transitioned object name including "null". + if (!key.instance.empty()) { return "-" + key.instance; } return ""; @@ -231,18 +233,38 @@ static void init_headers(map<string, bufferlist>& attrs, } } -/* Read object or just head from remote endpoint. For now initializes only headers, - * but can be extended to fetch etag, mtime etc if needed. +struct generic_attr { + const char *http_header; + const char *rgw_attr; +}; + +/* + * mapping between http env fields and rgw object attrs + */ +static const struct generic_attr generic_attrs[] = { + { "CONTENT_TYPE", RGW_ATTR_CONTENT_TYPE }, + { "HTTP_CONTENT_LANGUAGE", RGW_ATTR_CONTENT_LANG }, + { "HTTP_EXPIRES", RGW_ATTR_EXPIRES }, + { "HTTP_CACHE_CONTROL", RGW_ATTR_CACHE_CONTROL }, + { "HTTP_CONTENT_DISPOSITION", RGW_ATTR_CONTENT_DISP }, + { "HTTP_CONTENT_ENCODING", RGW_ATTR_CONTENT_ENC }, + { "HTTP_X_ROBOTS_TAG", RGW_ATTR_X_ROBOTS_TAG }, + { "ETAG", RGW_ATTR_ETAG }, +}; + +/* Read object or just head from remote endpoint. 
*/ -static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head, - std::map<std::string, std::string>& headers) { +int rgw_cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head, + std::map<std::string, std::string>& headers, + real_time* pset_mtime, std::string& etag, + uint64_t& accounted_size, rgw::sal::Attrs& attrs, + void* cb) { RGWRESTConn::get_obj_params req_params; std::string target_obj_name; int ret = 0; rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag, tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings, tier_ctx.target_storage_class); - std::string etag; RGWRESTStreamRWRequest *in_req; rgw_bucket dest_bucket; @@ -261,20 +283,57 @@ static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head, req_params.rgwx_stat = true; req_params.sync_manifest = true; req_params.skip_decrypt = true; + req_params.cb = (RGWHTTPStreamRWRequest::ReceiveCB *)cb; - ret = tier_ctx.conn.get_obj(tier_ctx.dpp, dest_obj, req_params, true /* send */, &in_req); - if (ret < 0) { - ldpp_dout(tier_ctx.dpp, 0) << "ERROR: " << __func__ << "(): conn.get_obj() returned ret=" << ret << dendl; - return ret; + ldpp_dout(tier_ctx.dpp, 20) << __func__ << "(): fetching object from cloud bucket:" << dest_bucket << ", object: " << target_obj_name << dendl; + + static constexpr int NUM_ENPOINT_IOERROR_RETRIES = 20; + for (int tries = 0; tries < NUM_ENPOINT_IOERROR_RETRIES; tries++) { + ret = tier_ctx.conn.get_obj(tier_ctx.dpp, dest_obj, req_params, true /* send */, &in_req); + if (ret < 0) { + ldpp_dout(tier_ctx.dpp, 0) << "ERROR: " << __func__ << "(): conn.get_obj() returned ret=" << ret << dendl; + return ret; + } + + /* fetch headers */ + // accounted_size in complete_request() reads from RGWX_OBJECT_SIZE which is set + // only for internal ops/sync. So instead read from headers[CONTENT_LEN]. + // Same goes for pattrs. + ret = tier_ctx.conn.complete_request(tier_ctx.dpp, in_req, &etag, pset_mtime, nullptr, nullptr, &headers, null_yield); + if (ret < 0) { + if (ret == -EIO && tries < NUM_ENPOINT_IOERROR_RETRIES - 1) { + ldpp_dout(tier_ctx.dpp, 20) << __func__ << "(): failed to fetch object from remote. 
retries=" << tries << dendl; + continue; + } + return ret; + } + break; } - /* fetch headers */ - ret = tier_ctx.conn.complete_request(tier_ctx.dpp, in_req, nullptr, nullptr, nullptr, nullptr, &headers, null_yield); - if (ret < 0 && ret != -ENOENT) { - ldpp_dout(tier_ctx.dpp, 20) << "ERROR: " << __func__ << "(): conn.complete_request() returned ret=" << ret << dendl; - return ret; + static map<string, string> generic_attrs_map; + for (const auto& http2rgw : generic_attrs) { + generic_attrs_map[http2rgw.http_header] = http2rgw.rgw_attr; } - return 0; + + for (auto header: headers) { + const char* name = header.first.c_str(); + const string& val = header.second; + bufferlist bl; + bl.append(val.c_str(), val.size()); + + const auto aiter = generic_attrs_map.find(name); + if (aiter != std::end(generic_attrs_map)) { + ldpp_dout(tier_ctx.dpp, 20) << __func__ << " Received attrs aiter->first = " << aiter->first << ", aiter->second = " << aiter->second << ret << dendl; + attrs[aiter->second] = bl; + } + + if (header.first == "CONTENT_LENGTH") { + accounted_size = atoi(val.c_str()); + } + } + + ldpp_dout(tier_ctx.dpp, 20) << __func__ << "(): Sucessfully fetched object from cloud bucket:" << dest_bucket << ", object: " << target_obj_name << dendl; + return ret; } static bool is_already_tiered(const DoutPrefixProvider *dpp, @@ -1184,9 +1243,12 @@ static int cloud_tier_multipart_transfer(RGWLCCloudTierCtx& tier_ctx) { static int cloud_tier_check_object(RGWLCCloudTierCtx& tier_ctx, bool& already_tiered) { int ret; std::map<std::string, std::string> headers; + std::string etag; + uint64_t accounted_size; + rgw::sal::Attrs attrs; /* Fetch Head object */ - ret = cloud_tier_get_object(tier_ctx, true, headers); + ret = rgw_cloud_tier_get_object(tier_ctx, true, headers, nullptr, etag, accounted_size, attrs, nullptr); if (ret < 0) { ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to fetch HEAD from cloud for obj=" << tier_ctx.obj << " , ret = " << ret << dendl; diff --git a/src/rgw/driver/rados/rgw_lc_tier.h b/src/rgw/driver/rados/rgw_lc_tier.h index 729c4c304cd..fd8013eb000 100644 --- a/src/rgw/driver/rados/rgw_lc_tier.h +++ b/src/rgw/driver/rados/rgw_lc_tier.h @@ -49,3 +49,9 @@ struct RGWLCCloudTierCtx { /* Transition object to cloud endpoint */ int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets); + +int rgw_cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head, + std::map<std::string, std::string>& headers, + real_time* pset_mtime, std::string& etag, + uint64_t& accounted_size, rgw::sal::Attrs& attrs, + void* cb); diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.cc b/src/rgw/driver/rados/rgw_object_expirer_core.cc index a5d788ea469..09a544df805 100644 --- a/src/rgw/driver/rados/rgw_object_expirer_core.cc +++ b/src/rgw/driver/rados/rgw_object_expirer_core.cc @@ -219,13 +219,9 @@ int RGWObjectExpirer::garbage_single_object(const DoutPrefixProvider *dpp, objex } rgw_obj_key key = hint.obj_key; - if (key.instance.empty()) { - key.instance = "null"; - } std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key); - obj->set_atomic(); - ret = obj->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP, nullptr, nullptr); + ret = static_cast<rgw::sal::RadosObject*>(obj.get())->handle_obj_expiry(dpp, null_yield); return ret; } diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc index e618c40cb90..a133b54dc59 100644 --- a/src/rgw/driver/rados/rgw_rados.cc +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -23,6 +23,7 @@ #include 
"common/BackTrace.h" #include "common/ceph_time.h" +#include "rgw_asio_thread.h" #include "rgw_cksum.h" #include "rgw_sal.h" #include "rgw_zone.h" @@ -36,6 +37,7 @@ #include "rgw_cr_rest.h" #include "rgw_datalog.h" #include "rgw_putobj_processor.h" +#include "rgw_lc_tier.h" #include "cls/rgw/cls_rgw_ops.h" #include "cls/rgw/cls_rgw_client.h" @@ -3211,6 +3213,30 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si op.setxattr(RGW_ATTR_STORAGE_CLASS, bl); } + /* For temporary restored copies, storage-class returned + * in GET/list-objects should correspond to original + * cloudtier storage class. For GET its handled in its REST + * response by verifying RESTORE_TYPE in attrs. But the same + * cannot be done for list-objects response and hence this + * needs to be updated in bi entry itself. + */ + auto attr_iter = attrs.find(RGW_ATTR_RESTORE_TYPE); + if (attr_iter != attrs.end()) { + rgw::sal::RGWRestoreType rt; + bufferlist bl = attr_iter->second; + auto iter = bl.cbegin(); + decode(rt, iter); + + if (rt == rgw::sal::RGWRestoreType::Temporary) { + // temporary restore; set storage-class to cloudtier storage class + auto c_iter = attrs.find(RGW_ATTR_CLOUDTIER_STORAGE_CLASS); + + if (c_iter != attrs.end()) { + storage_class = rgw_bl_str(c_iter->second); + } + } + } + if (!op.size()) return 0; @@ -3247,7 +3273,7 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si auto& ioctx = ref.ioctx; tracepoint(rgw_rados, operate_enter, req_id.c_str()); - r = rgw_rados_operate(rctx.dpp, ref.ioctx, ref.obj.oid, &op, rctx.y, 0, &trace); + r = rgw_rados_operate(rctx.dpp, ref.ioctx, ref.obj.oid, &op, rctx.y, 0, &trace, &epoch); tracepoint(rgw_rados, operate_exit, req_id.c_str()); if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under, or -ENOENT if was removed, or -EEXIST if it did not exist @@ -3259,7 +3285,6 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si goto done_cancel; } - epoch = ioctx.get_last_version(); poolid = ioctx.get_id(); r = target->complete_atomic_modification(rctx.dpp, rctx.y); @@ -5064,7 +5089,7 @@ int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx, int RGWRados::transition_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, - const rgw_obj& obj, + rgw_obj obj, const rgw_placement_rule& placement_rule, const real_time& mtime, uint64_t olh_epoch, @@ -5095,6 +5120,11 @@ int RGWRados::transition_obj(RGWObjectCtx& obj_ctx, return -ECANCELED; } + // bi expects empty instance for the entries created when bucket versioning + // is not enabled or suspended. + if (obj.key.instance == "null") { + obj.key.instance.clear(); + } attrs.erase(RGW_ATTR_ID_TAG); attrs.erase(RGW_ATTR_TAIL_TAG); @@ -5126,6 +5156,199 @@ int RGWRados::transition_obj(RGWObjectCtx& obj_ctx, return 0; } +int RGWRados::restore_obj_from_cloud(RGWLCCloudTierCtx& tier_ctx, + RGWObjectCtx& obj_ctx, + RGWBucketInfo& dest_bucket_info, + const rgw_obj& dest_obj, + rgw_placement_rule& dest_placement, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider *dpp, + optional_yield y, + bool log_op){ + + //XXX: read below from attrs .. 
check transition_obj() + ACLOwner owner; + rgw::sal::Attrs attrs; + const req_context rctx{dpp, y, nullptr}; + int ret = 0; + bufferlist t, t_tier; + string tag; + append_rand_alpha(cct, tag, tag, 32); + auto aio = rgw::make_throttle(cct->_conf->rgw_put_obj_min_window_size, y); + using namespace rgw::putobj; + jspan_context no_trace{false, false}; + rgw::putobj::AtomicObjectProcessor processor(aio.get(), this, dest_bucket_info, nullptr, + owner, obj_ctx, dest_obj, olh_epoch, tag, dpp, y, no_trace); + + void (*progress_cb)(off_t, void *) = NULL; + void *progress_data = NULL; + bool cb_processed = false; + RGWFetchObjFilter *filter; + RGWFetchObjFilter_Default source_filter; + if (!filter) { + filter = &source_filter; + } + boost::optional<RGWPutObj_Compress> compressor; + CompressorRef plugin; + RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data, + [&](map<string, bufferlist> obj_attrs) { + // XXX: do we need filter() like in fetch_remote_obj() cb + dest_placement.inherit_from(dest_bucket_info.placement_rule); + /* For now we always restore to STANDARD storage-class. + * Later we will add support to take restore-target-storage-class + * for permanent restore + */ + dest_placement.storage_class = RGW_STORAGE_CLASS_STANDARD; + + processor.set_tail_placement(dest_placement); + + ret = processor.prepare(rctx.y); + if (ret < 0) { + return ret; + } + cb_processed = true; + return 0; + }); + + uint64_t accounted_size = 0; + string etag; + real_time set_mtime; + std::map<std::string, std::string> headers; + ldpp_dout(dpp, 20) << "Fetching from cloud, object:" << dest_obj << dendl; + ret = rgw_cloud_tier_get_object(tier_ctx, false, headers, + &set_mtime, etag, accounted_size, + attrs, &cb); + + if (ret < 0) { + ldpp_dout(dpp, 20) << "Fetching from cloud failed, object:" << dest_obj << dendl; + return ret; + } + + if (!cb_processed) { + ldpp_dout(dpp, 20) << "Callback not processed, object:" << dest_obj << dendl; + return -EIO; + } + + ret = cb.flush(); + if (ret < 0) { + return ret; + } + + if (cb.get_data_len() != accounted_size) { + ret = -EIO; + ldpp_dout(dpp, -1) << "ERROR: object truncated during fetching, expected " + << accounted_size << " bytes but received " << cb.get_data_len() << dendl; + return ret; + } + + { + bufferlist bl; + encode(rgw::sal::RGWRestoreStatus::CloudRestored, bl); + attrs[RGW_ATTR_RESTORE_STATUS] = std::move(bl); + } + + ceph::real_time restore_time = real_clock::now(); + { + char buf[32]; + utime_t ut(restore_time); + snprintf(buf, sizeof(buf), "%lld.%09lld", + (long long)ut.sec(), + (long long)ut.nsec()); + bufferlist bl; + bl.append(buf, 32); + encode(restore_time, bl); + attrs[RGW_ATTR_RESTORE_TIME] = std::move(bl); + } + + real_time delete_at = real_time(); + if (days) { //temp copy; do not change mtime and set expiry date + int expiry_days = days.value(); + constexpr int32_t secs_in_a_day = 24 * 60 * 60; + ceph::real_time expiration_date ; + + if (cct->_conf->rgw_restore_debug_interval > 0) { + expiration_date = restore_time + make_timespan(double(expiry_days)*cct->_conf->rgw_restore_debug_interval); + ldpp_dout(dpp, 20) << "Setting expiration time to rgw_restore_debug_interval: " << double(expiry_days)*cct->_conf->rgw_restore_debug_interval << ", days:" << expiry_days << dendl; + } else { + expiration_date = restore_time + make_timespan(double(expiry_days) * secs_in_a_day); + } + delete_at = expiration_date; + + { + char buf[32]; + utime_t ut(expiration_date); + snprintf(buf, sizeof(buf), "%lld.%09lld", + (long long)ut.sec(), + 
(long long)ut.nsec()); + bufferlist bl; + bl.append(buf, 32); + encode(expiration_date, bl); + attrs[RGW_ATTR_RESTORE_EXPIRY_DATE] = std::move(bl); + } + { + bufferlist bl; + bl.clear(); + using ceph::encode; + encode(rgw::sal::RGWRestoreType::Temporary, bl); + attrs[RGW_ATTR_RESTORE_TYPE] = std::move(bl); + ldpp_dout(dpp, 20) << "Temporary restore, object:" << dest_obj << dendl; + } + { + string sc = tier_ctx.storage_class; + bufferlist bl; + bl.append(sc.c_str(), sc.size()); + attrs[RGW_ATTR_CLOUDTIER_STORAGE_CLASS] = std::move(bl); + ldpp_dout(dpp, 20) << "Setting RGW_ATTR_CLOUDTIER_STORAGE_CLASS: " << tier_ctx.storage_class << dendl; + } + //set same old mtime as that of transition time + set_mtime = mtime; + + // set tier-config only for temp restored objects, as + // permanent copies will be treated as regular objects + { + t.append("cloud-s3"); + encode(tier_config, t_tier); + attrs[RGW_ATTR_CLOUD_TIER_TYPE] = t; + attrs[RGW_ATTR_CLOUD_TIER_CONFIG] = t_tier; + } + + } else { // permanent restore + { + bufferlist bl; + bl.clear(); + using ceph::encode; + encode(rgw::sal::RGWRestoreType::Permanent, bl); + attrs[RGW_ATTR_RESTORE_TYPE] = std::move(bl); + ldpp_dout(dpp, 20) << "Permanent restore, object:" << dest_obj << dendl; + } + //set mtime to now() + set_mtime = real_clock::now(); + } + + { + string sc = dest_placement.get_storage_class(); //"STANDARD"; + bufferlist bl; + bl.append(sc.c_str(), sc.size()); + attrs[RGW_ATTR_STORAGE_CLASS] = std::move(bl); + } + + // XXX: handle COMPLETE_RETRY like in fetch_remote_obj + bool canceled = false; + rgw_zone_set zone_set{}; + ret = processor.complete(accounted_size, etag, &mtime, set_mtime, + attrs, rgw::cksum::no_cksum, delete_at , nullptr, nullptr, nullptr, + (rgw_zone_set *)&zone_set, &canceled, rctx, log_op ? 
rgw::sal::FLAG_LOG_OP : 0); + if (ret < 0) { + return ret; + } + + // XXX: handle olh_epoch for versioned objects like in fetch_remote_obj + return ret; +} + int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y) { constexpr uint NUM_ENTRIES = 1000u; @@ -5230,6 +5453,7 @@ int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& ob } /* remove bucket index objects asynchronously by best effort */ + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle (void) CLSRGWIssueBucketIndexClean(index_pool, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); @@ -5444,6 +5668,7 @@ int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& b bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret()); } + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle ret = CLSRGWIssueBucketCheck(index_pool, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)(); if (ret < 0) { return ret; @@ -5468,6 +5693,7 @@ int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& return r; } + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle return CLSRGWIssueBucketRebuild(index_pool, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); } @@ -5619,6 +5845,8 @@ int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, cpp_strerror(-r) << ")" << dendl; return r; } + + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle r = CLSRGWIssueSetBucketResharding(index_pool, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)(); if (r < 0) { ldpp_dout(dpp, 0) << "ERROR: " << __func__ << @@ -5870,7 +6098,8 @@ int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvi } auto& ioctx = ref.ioctx; - r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y); + version_t epoch = 0; + r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y, 0, nullptr, &epoch); /* raced with another operation, object state is indeterminate */ const bool need_invalidate = (r == -ECANCELED); @@ -5882,7 +6111,7 @@ int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvi tombstone_entry entry{*state}; obj_tombstone_cache->add(obj, entry); } - r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs, y, log_op); + r = index_op.complete_del(dpp, poolid, epoch, state->mtime, params.remove_objs, y, log_op); int ret = target->complete_atomic_modification(dpp, y); if (ret < 0) { @@ -6603,7 +6832,8 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBu struct timespec mtime_ts = real_clock::to_timespec(mtime); op.mtime2(&mtime_ts); auto& ioctx = ref.ioctx; - r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y); + version_t epoch = 0; + r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y, 0, nullptr, &epoch); if (state) { if (r >= 0) { ACLOwner owner; @@ -6634,11 +6864,29 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBu iter != state->attrset.end()) { storage_class = rgw_bl_str(iter->second); } - uint64_t epoch = ioctx.get_last_version(); int64_t poolid = ioctx.get_id(); + + // Retain Object category as CloudTiered while restore is in + // progress or failed + RGWObjCategory category = RGWObjCategory::Main; + auto r_iter = attrs.find(RGW_ATTR_RESTORE_STATUS); + if (r_iter != attrs.end()) { + rgw::sal::RGWRestoreStatus st = rgw::sal::RGWRestoreStatus::None; + auto iter = r_iter->second.cbegin(); + + try { + using ceph::decode; + decode(st, iter); + + if (st != 
rgw::sal::RGWRestoreStatus::CloudRestored) { + category = RGWObjCategory::CloudTiered; + } + } catch (buffer::error& err) { + } + } r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size, mtime, etag, content_type, storage_class, owner, - RGWObjCategory::Main, nullptr, y, nullptr, false, log_op); + category, nullptr, y, nullptr, false, log_op); } else { int ret = index_op.cancel(dpp, nullptr, y, log_op); if (ret < 0) { @@ -6803,6 +7051,13 @@ int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider * RGWBucketInfo& bucket_info = source->get_bucket_info(); if (params.part_num) { + map<string, bufferlist> src_attrset; + for (auto& iter : astate->attrset) { + if (boost::algorithm::starts_with(iter.first, RGW_ATTR_CRYPT_PREFIX)) { + ldpp_dout(dpp, 4) << "get src crypt attr: " << iter.first << dendl; + src_attrset[iter.first] = iter.second; + } + } int parts_count = 0; // use the manifest to redirect to the requested part number r = get_part_obj_state(dpp, y, store, bucket_info, &source->get_ctx(), @@ -6825,6 +7080,13 @@ int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider * } else { params.parts_count = parts_count; } + + for (auto& iter : src_attrset) { + ldpp_dout(dpp, 4) << "copy crypt attr: " << iter.first << dendl; + if (astate->attrset.find(iter.first) == astate->attrset.end()) { + astate->attrset[iter.first] = std::move(iter.second); + } + } } state.obj = astate->obj; @@ -8799,12 +9061,7 @@ int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp, } bufferlist outbl; - r = rgw_rados_operate(dpp, ref.ioctx, ref.obj.oid, &op, &outbl, y); - - if (epoch) { - *epoch = ref.ioctx.get_last_version(); - } - + r = rgw_rados_operate(dpp, ref.ioctx, ref.obj.oid, &op, &outbl, y, 0, nullptr, epoch); if (r < 0) return r; @@ -9526,6 +9783,7 @@ int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWB if (r < 0) return r; + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle return CLSRGWIssueSetTagTimeout(index_pool, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)(); } @@ -9655,8 +9913,15 @@ int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp, num_entries << " total entries" << dendl; auto& ioctx = index_pool; + + // XXX: check_disk_state() relies on ioctx.get_last_version() but that + // returns 0 because CLSRGWIssueBucketList doesn't make any synchonous calls + rgw_bucket_entry_ver index_ver; + index_ver.pool = ioctx.get_id(); + std::map<int, rgw_cls_list_ret> shard_list_results; cls_rgw_obj_key start_after_key(start_after.name, start_after.instance); + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter, num_entries_per_shard, list_versions, shard_oids, shard_list_results, @@ -9778,12 +10043,10 @@ int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp, /* there are uncommitted ops. We need to check the current * state, and if the tags are old we need to do clean-up as * well. 
*/ - librados::IoCtx sub_ctx; - sub_ctx.dup(ioctx); ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << " calling check_disk_state bucket=" << bucket_info.bucket << " entry=" << dirent.key << dendl_bitx; - r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, + r = check_disk_state(dpp, bucket_info, index_ver, dirent, dirent, updates[tracker.oid_name], y); if (r < 0 && r != -ENOENT) { ldpp_dout(dpp, 0) << __func__ << @@ -10005,6 +10268,9 @@ int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp, } } + rgw_bucket_entry_ver index_ver; + index_ver.pool = ioctx.get_id(); + uint32_t count = 0u; std::map<std::string, bufferlist> updates; rgw_obj_index_key last_added_entry; @@ -10019,7 +10285,7 @@ int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp, cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter, num_entries, list_versions, &result); - r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y); + r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y, 0, nullptr, &index_ver.epoch); if (r < 0) { ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": error in rgw_rados_operate (bucket list op), r=" << r << dendl; @@ -10036,12 +10302,10 @@ int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp, force_check) { /* there are uncommitted ops. We need to check the current state, * and if the tags are old we need to do cleanup as well. */ - librados::IoCtx sub_ctx; - sub_ctx.dup(ioctx); ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ << ": calling check_disk_state bucket=" << bucket_info.bucket << " entry=" << dirent.key << dendl_bitx; - r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y); + r = check_disk_state(dpp, bucket_info, index_ver, dirent, dirent, updates[oid], y); if (r < 0 && r != -ENOENT) { ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": error in check_disk_state, r=" << r << dendl; @@ -10273,8 +10537,8 @@ int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp, } int RGWRados::check_disk_state(const DoutPrefixProvider *dpp, - librados::IoCtx io_ctx, RGWBucketInfo& bucket_info, + const rgw_bucket_entry_ver& index_ver, rgw_bucket_dir_entry& list_state, rgw_bucket_dir_entry& object, bufferlist& suggested_updates, @@ -10302,8 +10566,6 @@ int RGWRados::check_disk_state(const DoutPrefixProvider *dpp, ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl; } - io_ctx.locator_set_key(list_state.locator); - RGWObjState *astate = NULL; RGWObjManifest *manifest = nullptr; RGWObjectCtx octx(this->driver); @@ -10324,8 +10586,7 @@ int RGWRados::check_disk_state(const DoutPrefixProvider *dpp, } // encode a suggested removal of that key - list_state.ver.epoch = io_ctx.get_last_version(); - list_state.ver.pool = io_ctx.get_id(); + list_state.ver = index_ver; ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx; cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates); return -ENOENT; @@ -10441,6 +10702,7 @@ int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo return r; } + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle r = CLSRGWIssueGetDirHeader(index_pool, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)(); if (r < 0) { ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned " diff --git a/src/rgw/driver/rados/rgw_rados.h 
b/src/rgw/driver/rados/rgw_rados.h index f95b6654a93..b24823b60dc 100644 --- a/src/rgw/driver/rados/rgw_rados.h +++ b/src/rgw/driver/rados/rgw_rados.h @@ -43,6 +43,7 @@ #include "rgw_tools.h" struct D3nDataCache; +struct RGWLCCloudTierCtx; class RGWWatcher; class ACLOwner; @@ -1233,13 +1234,25 @@ public: int transition_obj(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, - const rgw_obj& obj, + rgw_obj obj, const rgw_placement_rule& placement_rule, const real_time& mtime, uint64_t olh_epoch, const DoutPrefixProvider *dpp, optional_yield y, bool log_op = true); +int restore_obj_from_cloud(RGWLCCloudTierCtx& tier_ctx, + RGWObjectCtx& obj_ctx, + RGWBucketInfo& dest_bucket_info, + const rgw_obj& dest_obj, + rgw_placement_rule& dest_placement, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider *dpp, + optional_yield y, + bool log_op = true); int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y); @@ -1642,8 +1655,8 @@ public: * will encode that info as a suggested update.) */ int check_disk_state(const DoutPrefixProvider *dpp, - librados::IoCtx io_ctx, RGWBucketInfo& bucket_info, + const rgw_bucket_entry_ver& index_ver, rgw_bucket_dir_entry& list_state, rgw_bucket_dir_entry& object, bufferlist& suggested_updates, diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc index 91b3cc02648..11b86a25841 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.cc +++ b/src/rgw/driver/rados/rgw_sal_rados.cc @@ -55,6 +55,7 @@ #include "rgw_rest_ratelimit.h" #include "rgw_rest_realm.h" #include "rgw_rest_user.h" +#include "rgw_lc_tier.h" #include "services/svc_sys_obj.h" #include "services/svc_mdlog.h" #include "services/svc_cls.h" @@ -2491,6 +2492,107 @@ int RadosObject::transition(Bucket* bucket, mtime, olh_epoch, dpp, y, flags & FLAG_LOG_OP); } +int RadosObject::restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) +{ + /* init */ + rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier); + string id = "cloudid"; + string endpoint = rtier->get_rt().t.s3.endpoint; + RGWAccessKey key = rtier->get_rt().t.s3.key; + string region = rtier->get_rt().t.s3.region; + HostStyle host_style = rtier->get_rt().t.s3.host_style; + string bucket_name = rtier->get_rt().t.s3.target_path; + const rgw::sal::ZoneGroup& zonegroup = store->get_zone()->get_zonegroup(); + int ret = 0; + string src_storage_class = o.meta.storage_class; // or take src_placement also as input + + // fetch mtime of the object + std::unique_ptr<rgw::sal::Object::ReadOp> read_op(get_read_op()); + read_op->params.lastmod = &mtime; + + ret = read_op->prepare(y, dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "Restoring object(" << o.key << "): read_op failed ret=" << ret << dendl; + return ret; + } + + if (bucket_name.empty()) { + bucket_name = "rgwx-" + zonegroup.get_name() + "-" + tier->get_storage_class() + + "-cloud-bucket"; + boost::algorithm::to_lower(bucket_name); + } + /* Create RGW REST connection */ + S3RESTConn conn(cct, id, { endpoint }, key, zonegroup.get_id(), region, host_style); + + // save source cloudtier storage class + RGWLCCloudTierCtx tier_ctx(cct, dpp, o, store, bucket->get_info(), + 
this, conn, bucket_name, + rtier->get_rt().t.s3.target_storage_class); + tier_ctx.acl_mappings = rtier->get_rt().t.s3.acl_mappings; + tier_ctx.multipart_min_part_size = rtier->get_rt().t.s3.multipart_min_part_size; + tier_ctx.multipart_sync_threshold = rtier->get_rt().t.s3.multipart_sync_threshold; + tier_ctx.storage_class = tier->get_storage_class(); + + ldpp_dout(dpp, 20) << "Restoring object(" << o.key << ") from the cloud endpoint(" << endpoint << ")" << dendl; + + if (days && days == 0) { + ldpp_dout(dpp, 0) << "Days = 0 not valid; Not restoring object (" << o.key << ") from the cloud endpoint(" << endpoint << ")" << dendl; + return 0; + } + + // Note: For non-versioned objects, below should have already been set by the callers- + // o.current should be false; this(obj)->instance should have version-id. + + // set restore_status as RESTORE_ALREADY_IN_PROGRESS + ret = set_cloud_restore_status(dpp, y, RGWRestoreStatus::RestoreAlreadyInProgress); + if (ret < 0) { + ldpp_dout(dpp, 0) << " Setting cloud restore status to RESTORE_ALREADY_IN_PROGRESS for the object(" << o.key << ") from the cloud endpoint(" << endpoint << ") failed, ret=" << ret << dendl; + return ret; + } + + /* Restore object from the cloud endpoint. + * All restore related status and attrs are set as part of object download to + * avoid any races */ + ret = store->getRados()->restore_obj_from_cloud(tier_ctx, *rados_ctx, + bucket->get_info(), get_obj(), placement_rule, + tier_config, + mtime, olh_epoch, days, dpp, y, flags & FLAG_LOG_OP); + + if (ret < 0) { //failed to restore + ldpp_dout(dpp, 0) << "Restoring object(" << o.key << ") from the cloud endpoint(" << endpoint << ") failed, ret=" << ret << dendl; + auto reset_ret = set_cloud_restore_status(dpp, y, RGWRestoreStatus::RestoreFailed); + + rgw_placement_rule target_placement; + target_placement.inherit_from(tier_ctx.bucket_info.placement_rule); + target_placement.storage_class = tier->get_storage_class(); + + /* Reset HEAD object as CloudTiered */ + reset_ret = write_cloud_tier(dpp, y, tier_ctx.o.versioned_epoch, + tier, tier_ctx.is_multipart_upload, + target_placement, tier_ctx.obj); + + if (reset_ret < 0) { + ldpp_dout(dpp, 0) << " Reset to cloud_tier of object(" << o.key << ") from the cloud endpoint(" << endpoint << ") failed, ret=" << reset_ret << dendl; + } + return ret; + } + + ldpp_dout(dpp, 20) << "Sucessfully restored object(" << o.key << ") from the cloud endpoint(" << endpoint << ")" << dendl; + + return ret; +} + int RadosObject::transition_to_cloud(Bucket* bucket, rgw::sal::PlacementTier* tier, rgw_bucket_dir_entry& o, @@ -2568,6 +2670,118 @@ int RadosObject::transition_to_cloud(Bucket* bucket, return ret; } +int RadosObject::set_cloud_restore_status(const DoutPrefixProvider* dpp, + optional_yield y, + rgw::sal::RGWRestoreStatus restore_status) +{ + int ret = 0; + set_atomic(); + + bufferlist bl; + using ceph::encode; + encode(restore_status, bl); + + ret = modify_obj_attrs(RGW_ATTR_RESTORE_STATUS, bl, y, dpp); + + return ret; +} + +/* + * If the object is restored temporarily and is expired, delete the data and + * reset the HEAD object as cloud-transitioned. 
+ */ +int RadosObject::handle_obj_expiry(const DoutPrefixProvider* dpp, optional_yield y) { + int ret = 0; + real_time read_mtime; + std::unique_ptr<rgw::sal::Object::ReadOp> read_op(get_read_op()); + read_op->params.lastmod = &read_mtime; + ldpp_dout(dpp, 20) << "Entering handle_obj_expiry Obj:" << get_key() << dendl; + + ret = read_op->prepare(y, dpp); + if (ret < 0) { + ldpp_dout(dpp, -1) << "handle_obj_expiry Obj:" << get_key() << + ", read_op failed ret=" << ret << dendl; + return ret; + } + + set_atomic(); + map<string, bufferlist> attrs = get_attrs(); + RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj()); + RGWRados::Object::Write obj_op(&op_target); + Object* obj = (Object*)this; + + bufferlist bl; + auto attr_iter = attrs.find(RGW_ATTR_RESTORE_TYPE); + if (attr_iter != attrs.end()) { + using ceph::decode; + rgw::sal::RGWRestoreType restore_type; + decode(restore_type, attr_iter->second); + if (restore_type == rgw::sal::RGWRestoreType::Temporary) { + ldpp_dout(dpp, 10) << "Expiring temporary restored Obj:" << get_key() << dendl; + + attr_iter = attrs.find(RGW_ATTR_MANIFEST); + if (attr_iter != attrs.end()) { + RGWObjManifest m; + try { + using ceph::decode; + decode(m, attr_iter->second); + obj_op.meta.modify_tail = true; + obj_op.meta.flags = PUT_OBJ_CREATE; + obj_op.meta.category = RGWObjCategory::CloudTiered; + obj_op.meta.delete_at = real_time(); + bufferlist blo; + obj_op.meta.data = &blo; + obj_op.meta.if_match = NULL; + obj_op.meta.if_nomatch = NULL; + obj_op.meta.user_data = NULL; + obj_op.meta.zones_trace = NULL; + obj_op.meta.set_mtime = read_mtime; + + RGWObjManifest *pmanifest; + pmanifest = &m; + + Object* head_obj = (Object*)this; + RGWObjTier tier_config; + m.get_tier_config(&tier_config); + + rgw_placement_rule target_placement(pmanifest->get_head_placement_rule(), tier_config.name); + + pmanifest->set_head(target_placement, head_obj->get_obj(), 0); + pmanifest->set_tail_placement(target_placement, head_obj->get_obj().bucket); + pmanifest->set_obj_size(0); + obj_op.meta.manifest = pmanifest; + + // erase restore attrs + attrs.erase(RGW_ATTR_RESTORE_STATUS); + attrs.erase(RGW_ATTR_RESTORE_TYPE); + attrs.erase(RGW_ATTR_RESTORE_TIME); + attrs.erase(RGW_ATTR_RESTORE_EXPIRY_DATE); + attrs.erase(RGW_ATTR_CLOUDTIER_STORAGE_CLASS); + + bufferlist bl; + bl.append(tier_config.name); + attrs[RGW_ATTR_STORAGE_CLASS] = bl; + + const req_context rctx{dpp, y, nullptr}; + return obj_op.write_meta(0, 0, attrs, rctx, head_obj->get_trace()); + } catch (const buffer::end_of_buffer&) { + // ignore empty manifest; it's not cloud-tiered + } catch (const std::exception& e) { + } + } + return 0; + } + } + // object is not restored/temporary; go for regular deletion + // ensure object is not overwritten and is really expired + if (is_expired()) { + ldpp_dout(dpp, 10) << "Deleting expired obj:" << get_key() << dendl; + + ret = obj->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP, nullptr, nullptr); + } + + return ret; +} int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp, optional_yield y, uint64_t olh_epoch, @@ -2578,6 +2792,13 @@ int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp, { rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier); map<string, bufferlist> attrs = get_attrs(); + rgw_obj_key& obj_key = get_key(); + // bi expects empty instance for the entries created when bucket versioning + // is not enabled or suspended. 
+ if (obj_key.instance == "null") { + obj_key.instance.clear(); + } + RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj()); RGWRados::Object::Write obj_op(&op_target); @@ -2592,7 +2813,6 @@ int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp, obj_op.meta.if_nomatch = NULL; obj_op.meta.user_data = NULL; obj_op.meta.zones_trace = NULL; - obj_op.meta.delete_at = real_time(); obj_op.meta.olh_epoch = olh_epoch; RGWObjManifest *pmanifest; @@ -2621,6 +2841,13 @@ int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp, attrs.erase(RGW_ATTR_ID_TAG); attrs.erase(RGW_ATTR_TAIL_TAG); + // erase restore attrs + attrs.erase(RGW_ATTR_RESTORE_STATUS); + attrs.erase(RGW_ATTR_RESTORE_TYPE); + attrs.erase(RGW_ATTR_RESTORE_TIME); + attrs.erase(RGW_ATTR_RESTORE_EXPIRY_DATE); + attrs.erase(RGW_ATTR_CLOUDTIER_STORAGE_CLASS); + const req_context rctx{dpp, y, nullptr}; return obj_op.write_meta(0, 0, attrs, rctx, head_obj->get_trace()); } diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h index 0372c5882aa..be681c9f975 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.h +++ b/src/rgw/driver/rados/rgw_sal_rados.h @@ -626,6 +626,18 @@ class RadosObject : public StoreObject { bool update_object, const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) override; virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override; virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override; @@ -664,6 +676,10 @@ class RadosObject : public StoreObject { bool is_multipart_upload, rgw_placement_rule& target_placement, Object* head_obj); + int handle_obj_expiry(const DoutPrefixProvider* dpp, optional_yield y); + int set_cloud_restore_status(const DoutPrefixProvider* dpp, + optional_yield y, + RGWRestoreStatus restore_status); RGWObjManifest* get_manifest() { return manifest; } RGWObjectCtx& get_ctx() { return *rados_ctx; } diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc index 0af353b866f..f5cd193d815 100644 --- a/src/rgw/driver/rados/rgw_tools.cc +++ b/src/rgw/driver/rados/rgw_tools.cc @@ -198,36 +198,52 @@ int rgw_delete_system_obj(const DoutPrefixProvider *dpp, int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, librados::ObjectReadOperation *op, bufferlist* pbl, - optional_yield y, int flags, const jspan_context* trace_info) + optional_yield y, int flags, const jspan_context* trace_info, + version_t* pver) { // given a yield_context, call async_operate() to yield the coroutine instead // of blocking if (y) { auto& yield = y.get_yield_context(); boost::system::error_code ec; - auto bl = librados::async_operate( + auto [ver, bl] = librados::async_operate( yield, ioctx, oid, op, flags, trace_info, yield[ec]); if (pbl) { *pbl = std::move(bl); } + if (pver) { + *pver = ver; + } return -ec.value(); } maybe_warn_about_blocking(dpp); - return ioctx.operate(oid, op, nullptr, flags); + int r = ioctx.operate(oid, op, nullptr, flags); + if (pver) { + *pver = ioctx.get_last_version(); + } + return r; } int rgw_rados_operate(const DoutPrefixProvider *dpp, 
librados::IoCtx& ioctx, const std::string& oid, librados::ObjectWriteOperation *op, optional_yield y, - int flags, const jspan_context* trace_info) + int flags, const jspan_context* trace_info, version_t* pver) { if (y) { auto& yield = y.get_yield_context(); boost::system::error_code ec; - librados::async_operate(yield, ioctx, oid, op, flags, trace_info, yield[ec]); + version_t ver = librados::async_operate(yield, ioctx, oid, op, flags, + trace_info, yield[ec]); + if (pver) { + *pver = ver; + } return -ec.value(); } maybe_warn_about_blocking(dpp); - return ioctx.operate(oid, op, flags, trace_info); + int r = ioctx.operate(oid, op, flags, trace_info); + if (pver) { + *pver = ioctx.get_last_version(); + } + return r; } int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, @@ -237,8 +253,8 @@ int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, cons if (y) { auto& yield = y.get_yield_context(); boost::system::error_code ec; - auto reply = librados::async_notify(yield, ioctx, oid, - bl, timeout_ms, yield[ec]); + auto [ver, reply] = librados::async_notify(yield, ioctx, oid, + bl, timeout_ms, yield[ec]); if (pbl) { *pbl = std::move(reply); } diff --git a/src/rgw/driver/rados/rgw_tools.h b/src/rgw/driver/rados/rgw_tools.h index 257e513a9f7..016da256263 100644 --- a/src/rgw/driver/rados/rgw_tools.h +++ b/src/rgw/driver/rados/rgw_tools.h @@ -93,10 +93,12 @@ void rgw_filter_attrset(std::map<std::string, bufferlist>& unfiltered_attrset, c /// perform the rados operation, using the yield context when given int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, librados::ObjectReadOperation *op, bufferlist* pbl, - optional_yield y, int flags = 0, const jspan_context* trace_info = nullptr); + optional_yield y, int flags = 0, const jspan_context* trace_info = nullptr, + version_t* pver = nullptr); int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, librados::ObjectWriteOperation *op, optional_yield y, - int flags = 0, const jspan_context* trace_info = nullptr); + int flags = 0, const jspan_context* trace_info = nullptr, + version_t* pver = nullptr); int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid, bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl, optional_yield y); diff --git a/src/rgw/driver/rados/rgw_zone.cc b/src/rgw/driver/rados/rgw_zone.cc index f9de570aa54..7d5fe3bcb21 100644 --- a/src/rgw/driver/rados/rgw_zone.cc +++ b/src/rgw/driver/rados/rgw_zone.cc @@ -1355,6 +1355,20 @@ int RGWZoneGroupPlacementTier::update_params(const JSONFormattable& config) retain_head_object = false; } } + if (config.exists("allow_read_through")) { + string s = config["allow_read_through"]; + if (s == "true") { + allow_read_through = true; + } else { + allow_read_through = false; + } + } + if (config.exists("read_through_restore_days")) { + r = conf_to_uint64(config, "read_through_restore_days", &read_through_restore_days); + if (r < 0) { + read_through_restore_days = DEFAULT_READ_THROUGH_RESTORE_DAYS; + } + } if (tier_type == "cloud-s3") { r = t.s3.update_params(config); @@ -1368,6 +1382,12 @@ int RGWZoneGroupPlacementTier::clear_params(const JSONFormattable& config) if (config.exists("retain_head_object")) { retain_head_object = false; } + if (config.exists("allow_read_through")) { + allow_read_through = false; + } + if (config.exists("read_through_restore_days")) { + read_through_restore_days = 
DEFAULT_READ_THROUGH_RESTORE_DAYS; + } if (tier_type == "cloud-s3") { t.s3.clear_params(config); diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index a8874195217..b00dfaa1ec5 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -11187,22 +11187,22 @@ next: } formatter->open_object_section("result"); - formatter->open_array_section("topics"); - do { - rgw_pubsub_topics result; - int ret = ps.get_topics(dpp(), next_token, max_entries, - result, next_token, null_yield); - if (ret < 0 && ret != -ENOENT) { - cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl; - return -ret; - } - for (const auto& [_, topic] : result.topics) { - if (owner && *owner != topic.owner) { - continue; + rgw_pubsub_topics result; + if (rgw::all_zonegroups_support(*site, rgw::zone_features::notification_v2) && + driver->stat_topics_v1(tenant, null_yield, dpp()) == -ENOENT) { + formatter->open_array_section("topics"); + do { + int ret = ps.get_topics_v2(dpp(), next_token, max_entries, + result, next_token, null_yield); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl; + return -ret; } - std::set<std::string> subscribed_buckets; - if (rgw::all_zonegroups_support(*site, rgw::zone_features::notification_v2) && - driver->stat_topics_v1(tenant, null_yield, dpp()) == -ENOENT) { + for (const auto& [_, topic] : result.topics) { + if (owner && *owner != topic.owner) { + continue; + } + std::set<std::string> subscribed_buckets; ret = driver->get_bucket_topic_mapping(topic, subscribed_buckets, null_yield, dpp()); if (ret < 0) { @@ -11210,15 +11210,21 @@ next: << topic.name << ", ret=" << ret << std::endl; } show_topics_info_v2(topic, subscribed_buckets, formatter.get()); - } else { - encode_json("result", result, formatter.get()); - } - if (max_entries_specified) { - --max_entries; + if (max_entries_specified) { + --max_entries; + } } + result.topics.clear(); + } while (!next_token.empty() && max_entries > 0); + formatter->close_section(); // topics + } else { // v1, list all topics + int ret = ps.get_topics_v1(dpp(), result, null_yield); + if (ret < 0 && ret != -ENOENT) { + cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl; + return -ret; } - } while (!next_token.empty() && max_entries > 0); - formatter->close_section(); // topics + encode_json("result", result, formatter.get()); + } if (max_entries_specified) { encode_json("truncated", !next_token.empty(), formatter.get()); if (!next_token.empty()) { diff --git a/src/rgw/rgw_aio.cc b/src/rgw/rgw_aio.cc index 7fba58ad63f..d2e56c57298 100644 --- a/src/rgw/rgw_aio.cc +++ b/src/rgw/rgw_aio.cc @@ -76,12 +76,12 @@ struct Handler { librados::IoCtx ctx; AioResult& r; // write callback - void operator()(boost::system::error_code ec) const { + void operator()(boost::system::error_code ec, version_t) const { r.result = -ec.value(); throttle->put(r); } // read callback - void operator()(boost::system::error_code ec, bufferlist bl) const { + void operator()(boost::system::error_code ec, version_t, bufferlist bl) const { r.result = -ec.value(); r.data = std::move(bl); throttle->put(r); diff --git a/src/rgw/rgw_asio_frontend.cc b/src/rgw/rgw_asio_frontend.cc index f80e40c70b6..30e1e77fd15 100644 --- a/src/rgw/rgw_asio_frontend.cc +++ b/src/rgw/rgw_asio_frontend.cc @@ -1156,6 +1156,20 @@ void AsioFrontend::stop() // signal cancellation of accept() listener.signal.emit(boost::asio::cancellation_type::terminal); } + + const bool graceful_stop{ 
g_ceph_context->_conf->rgw_graceful_stop }; + if (graceful_stop) { + ldout(ctx(), 4) << "frontend pausing and waiting for outstanding requests to complete..." << dendl; + pause_mutex.lock(ec); + if (ec) { + ldout(ctx(), 1) << "frontend failed to pause: " << ec.message() << dendl; + } else { + ldout(ctx(), 4) << "frontend paused" << dendl; + } + ldout(ctx(), 4) << "frontend outstanding requests have completed" << dendl; + pause_mutex.unlock(); + } + // close all connections connections.close(ec); pause_mutex.cancel(); diff --git a/src/rgw/rgw_auth.cc b/src/rgw/rgw_auth.cc index 290b9bb46b3..ac1ed8b75d6 100644 --- a/src/rgw/rgw_auth.cc +++ b/src/rgw/rgw_auth.cc @@ -505,12 +505,12 @@ rgw::auth::Strategy::apply(const DoutPrefixProvider *dpp, const rgw::auth::Strat ldpp_dout(dpp, 5) << "Failed the auth strategy, reason=" << result.get_reason() << dendl; // Special handling for expired pre-signed URL - if (result.get_reason() == ERR_PRESIGNED_URL_EXPIRED) { + if (result.get_reason() == -ERR_PRESIGNED_URL_EXPIRED) { result = result_t::deny(-EPERM); set_req_state_err(s, -EPERM, "The pre-signed URL has expired"); } // Special handling for disabled presigned URL - if (result.get_reason() == ERR_PRESIGNED_URL_DISABLED) { + if (result.get_reason() == -ERR_PRESIGNED_URL_DISABLED) { result = result_t::deny(-EPERM); set_req_state_err(s, -EPERM, "Presigned URLs are disabled by admin"); } diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc index 412f4bf759a..4fe1e39d0a8 100644 --- a/src/rgw/rgw_auth_s3.cc +++ b/src/rgw/rgw_auth_s3.cc @@ -191,6 +191,7 @@ static inline void get_v2_qs_map(const req_info& info, * compute a request's signature */ bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, + RGWOpType op_type, const req_info& info, utime_t* const header_time, std::string& dest, @@ -253,7 +254,8 @@ bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, request_uri = info.effective_uri; } - rgw_create_s3_canonical_header(dpp, info.method, content_md5, content_type, + auto method = rgw::auth::s3::get_canonical_method(dpp, op_type, info); + rgw_create_s3_canonical_header(dpp, method.c_str(), content_md5, content_type, date.c_str(), meta_map, qs_map, request_uri.c_str(), sub_resources, dest); return true; @@ -704,35 +706,6 @@ std::string gen_v4_canonical_qs(const req_info& info, bool is_non_s3_op) return canonical_qs; } -std::string get_v4_canonical_method(const req_state* s) -{ - /* If this is a OPTIONS request we need to compute the v4 signature for the - * intended HTTP method and not the OPTIONS request itself. */ - if (s->op_type == RGW_OP_OPTIONS_CORS) { - const char *cors_method = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); - - if (cors_method) { - /* Validate request method passed in access-control-request-method is valid. 
*/ - auto cors_flags = get_cors_method_flags(cors_method); - if (!cors_flags) { - ldpp_dout(s, 1) << "invalid access-control-request-method header = " - << cors_method << dendl; - throw -EINVAL; - } - - ldpp_dout(s, 10) << "canonical req method = " << cors_method - << ", due to access-control-request-method header" << dendl; - return cors_method; - } else { - ldpp_dout(s, 1) << "invalid http options req missing " - << "access-control-request-method header" << dendl; - throw -EINVAL; - } - } - - return s->info.method; -} - boost::optional<std::string> get_v4_canonical_headers(const req_info& info, const std::string_view& signedheaders, @@ -1740,4 +1713,32 @@ AWSv4ComplSingle::create(const req_state* const s, return std::make_shared<AWSv4ComplSingle>(s); } +std::string get_canonical_method(const DoutPrefixProvider *dpp, RGWOpType op_type, const req_info& info) +{ + /* If this is a OPTIONS request we need to compute the v4 signature for the + * intended HTTP method and not the OPTIONS request itself. */ + if (op_type == RGW_OP_OPTIONS_CORS) { + const char *cors_method = info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"); + + if (cors_method) { + /* Validate request method passed in access-control-request-method is valid. */ + auto cors_flags = get_cors_method_flags(cors_method); + if (!cors_flags) { + ldpp_dout(dpp, 1) << "invalid access-control-request-method header = " + << cors_method << dendl; + throw -EINVAL; + } + + ldpp_dout(dpp, 10) << "canonical req method = " << cors_method + << ", due to access-control-request-method header" << dendl; + return cors_method; + } else { + ldpp_dout(dpp, 1) << "invalid http options req missing " + << "access-control-request-method header" << dendl; + throw -EINVAL; + } + } + + return info.method; +} } // namespace rgw::auth::s3 diff --git a/src/rgw/rgw_auth_s3.h b/src/rgw/rgw_auth_s3.h index e1fe5163f02..2f7fd2d7598 100644 --- a/src/rgw/rgw_auth_s3.h +++ b/src/rgw/rgw_auth_s3.h @@ -500,16 +500,17 @@ void rgw_create_s3_canonical_header( const std::map<std::string, std::string>& sub_resources, std::string& dest_str); bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, + RGWOpType op_type, const req_info& info, utime_t *header_time, /* out */ std::string& dest, /* out */ bool qsr); static inline std::tuple<bool, std::string, utime_t> -rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, const req_info& info, const bool qsr) { +rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, RGWOpType op_type, const req_info& info, const bool qsr) { std::string dest; utime_t header_time; - const bool ok = rgw_create_s3_canonical_header(dpp, info, &header_time, dest, qsr); + const bool ok = rgw_create_s3_canonical_header(dpp, op_type, info, &header_time, dest, qsr); return std::make_tuple(ok, dest, header_time); } @@ -704,8 +705,6 @@ std::string get_v4_canonical_qs(const req_info& info, bool using_qs); std::string gen_v4_canonical_qs(const req_info& info, bool is_non_s3_op); -std::string get_v4_canonical_method(const req_state* s); - boost::optional<std::string> get_v4_canonical_headers(const req_info& info, const std::string_view& signedheaders, @@ -745,6 +744,8 @@ extern AWSEngine::VersionAbstractor::server_signature_t get_v2_signature(CephContext*, const std::string& secret_key, const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign); + +std::string get_canonical_method(const DoutPrefixProvider *dpp, RGWOpType op_type, const req_info& info); } /* namespace s3 */ } /* namespace auth */ } /* namespace rgw */ diff --git 
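The canonical-method helper declared above encodes the CORS preflight rule: for an OPTIONS request the signature is computed over the intended method carried in Access-Control-Request-Method, not over OPTIONS itself. A minimal standalone sketch of that rule, under simplified assumptions (the real code also validates the header against the known CORS method flags and returns an error code rather than throwing):

    // Sketch of the canonical-method rule for CORS preflight requests.
    // Simplified standalone logic, not the actual RGW functions.
    #include <iostream>
    #include <optional>
    #include <stdexcept>
    #include <string>

    std::string canonical_method(const std::string& http_method,
                                 const std::optional<std::string>& acrm_header) {
      if (http_method != "OPTIONS") {
        return http_method;                 // normal requests sign their own method
      }
      if (!acrm_header || acrm_header->empty()) {
        throw std::invalid_argument("preflight without access-control-request-method");
      }
      return *acrm_header;                  // preflight signs the intended method
    }

    int main() {
      std::cout << canonical_method("GET", std::nullopt) << "\n";           // GET
      std::cout << canonical_method("OPTIONS", std::string("PUT")) << "\n"; // PUT
      return 0;
    }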
a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index b9e969a06fa..a8f6a1107a9 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -118,6 +118,12 @@ using ceph::crypto::MD5; #define RGW_ATTR_SOURCE_ZONE RGW_ATTR_PREFIX "source_zone" #define RGW_ATTR_TAGS RGW_ATTR_PREFIX RGW_AMZ_PREFIX "tagging" +#define RGW_ATTR_CLOUDTIER_STORAGE_CLASS RGW_ATTR_PREFIX "cloudtier_storage_class" +#define RGW_ATTR_RESTORE_STATUS RGW_ATTR_PREFIX "restore-status" +#define RGW_ATTR_RESTORE_TYPE RGW_ATTR_PREFIX "restore-type" +#define RGW_ATTR_RESTORE_TIME RGW_ATTR_PREFIX "restored-at" +#define RGW_ATTR_RESTORE_EXPIRY_DATE RGW_ATTR_PREFIX "restore-expiry-date" + #define RGW_ATTR_TEMPURL_KEY1 RGW_ATTR_META_PREFIX "temp-url-key" #define RGW_ATTR_TEMPURL_KEY2 RGW_ATTR_META_PREFIX "temp-url-key-2" diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc index 0c80ad1b765..a7f2ceabad3 100644 --- a/src/rgw/rgw_lc.cc +++ b/src/rgw/rgw_lc.cc @@ -495,6 +495,14 @@ struct lc_op_ctx { octx(env.driver), dpp(dpp), wq(wq) { obj = bucket->get_object(o.key); + /* once bucket versioning is enabled, the non-current entries with + * instance empty should have instance set to "null" to be able + * to correctly read its olh version entry. + */ + if (o.key.instance.empty() && bucket->versioned() && !o.is_current()) { + rgw_obj_key& obj_key = obj->get_key(); + obj_key.instance = "null"; + } } bool next_has_same_name(const std::string& key_name) { @@ -1355,9 +1363,9 @@ public: int delete_tier_obj(lc_op_ctx& oc) { int ret = 0; - /* If bucket is versioned, create delete_marker for current version + /* If bucket has versioning enabled, create delete_marker for current version */ - if (! oc.bucket->versioned()) { + if (! oc.bucket->versioning_enabled()) { ret = remove_expired_obj(oc.dpp, oc, true, {rgw::notify::ObjectTransition}); ldpp_dout(oc.dpp, 20) << "delete_tier_obj Object(key:" << oc.o.key @@ -1387,9 +1395,10 @@ public: int transition_obj_to_cloud(lc_op_ctx& oc) { int ret{0}; - /* If CurrentVersion object, remove it & create delete marker */ + /* If CurrentVersion object & bucket has versioning enabled, remove it & + * create delete marker */ bool delete_object = (!oc.tier->retain_head_object() || - (oc.o.is_current() && oc.bucket->versioned())); + (oc.o.is_current() && oc.bucket->versioning_enabled())); /* notifications */ auto& bucket = oc.bucket; @@ -1991,6 +2000,12 @@ int RGWLC::process(LCWorker* worker, } } + ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->process_expire_objects(this, null_yield); + if (ret < 0) { + ldpp_dout(this, 5) << "RGWLC::process_expire_objects: failed, " + << " worker ix: " << worker->ix << dendl; + } + return 0; } diff --git a/src/rgw/rgw_lc.h b/src/rgw/rgw_lc.h index b4c6ad4a86b..cc6a7e51a1d 100644 --- a/src/rgw/rgw_lc.h +++ b/src/rgw/rgw_lc.h @@ -469,7 +469,7 @@ struct transition_action int days; boost::optional<ceph::real_time> date; std::string storage_class; - transition_action() : days(0) {} + transition_action() : days(-1) {} void dump(Formatter *f) const { if (!date) { f->dump_int("days", days); diff --git a/src/rgw/rgw_lua_background.h b/src/rgw/rgw_lua_background.h index 7b8d12599f4..2973a753fff 100644 --- a/src/rgw/rgw_lua_background.h +++ b/src/rgw/rgw_lua_background.h @@ -153,9 +153,8 @@ private: void run(); -protected: std::string rgw_script; - virtual int read_script(); + int read_script(); public: Background(rgw::sal::Driver* _driver, @@ -173,7 +172,7 @@ public: std::unique_lock cond_lock(table_mutex); rgw_map[key] = value; } - + // update the manager after void 
set_manager(rgw::sal::LuaManager* _lua_manager); void pause() override; diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index b54805bdc7d..67829e6320a 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -941,37 +941,131 @@ void handle_replication_status_header( } /* - * GET on CloudTiered objects is processed only when sent from the sync client. - * In all other cases, fail with `ERR_INVALID_OBJECT_STATE`. + * GET on CloudTiered objects either it will synced to other zones. + * In all other cases, it will try to fetch the object from remote cloud endpoint. */ -int handle_cloudtier_obj(rgw::sal::Attrs& attrs, bool sync_cloudtiered) { +int handle_cloudtier_obj(req_state* s, const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, + rgw::sal::Attrs& attrs, bool sync_cloudtiered, std::optional<uint64_t> days, + bool restore_op, optional_yield y) +{ int op_ret = 0; + ldpp_dout(dpp, 20) << "reached handle cloud tier " << dendl; auto attr_iter = attrs.find(RGW_ATTR_MANIFEST); - if (attr_iter != attrs.end()) { - RGWObjManifest m; - try { - decode(m, attr_iter->second); - if (m.get_tier_type() == "cloud-s3") { - if (!sync_cloudtiered) { - /* XXX: Instead send presigned redirect or read-through */ + if (attr_iter == attrs.end()) { + if (restore_op) { + op_ret = -ERR_INVALID_OBJECT_STATE; + s->err.message = "only cloud tier object can be restored"; + return op_ret; + } else { //ignore for read-through + return 0; + } + } + RGWObjManifest m; + try { + decode(m, attr_iter->second); + if (m.get_tier_type() != "cloud-s3") { + ldpp_dout(dpp, 20) << "not a cloud tier object " << s->object->get_key().name << dendl; + if (restore_op) { + op_ret = -ERR_INVALID_OBJECT_STATE; + s->err.message = "only cloud tier object can be restored"; + return op_ret; + } else { //ignore for read-through + return 0; + } + } + RGWObjTier tier_config; + m.get_tier_config(&tier_config); + if (sync_cloudtiered) { + bufferlist t, t_tier; + t.append("cloud-s3"); + attrs[RGW_ATTR_CLOUD_TIER_TYPE] = t; + encode(tier_config, t_tier); + attrs[RGW_ATTR_CLOUD_TIER_CONFIG] = t_tier; + return op_ret; + } + attr_iter = attrs.find(RGW_ATTR_RESTORE_STATUS); + rgw::sal::RGWRestoreStatus restore_status = rgw::sal::RGWRestoreStatus::None; + if (attr_iter != attrs.end()) { + bufferlist bl = attr_iter->second; + auto iter = bl.cbegin(); + decode(restore_status, iter); + } + if (attr_iter == attrs.end() || restore_status == rgw::sal::RGWRestoreStatus::RestoreFailed) { + // first time restore or previous restore failed + rgw::sal::Bucket* pbucket = NULL; + pbucket = s->bucket.get(); + + std::unique_ptr<rgw::sal::PlacementTier> tier; + rgw_placement_rule target_placement; + target_placement.inherit_from(pbucket->get_placement_rule()); + attr_iter = attrs.find(RGW_ATTR_STORAGE_CLASS); + if (attr_iter != attrs.end()) { + target_placement.storage_class = attr_iter->second.to_str(); + } + op_ret = driver->get_zone()->get_zonegroup().get_placement_tier(target_placement, &tier); + ldpp_dout(dpp, 20) << "getting tier placement handle cloud tier" << op_ret << + " storage class " << target_placement.storage_class << dendl; + if (op_ret < 0) { + s->err.message = "failed to restore object"; + return op_ret; + } + rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier.get()); + tier_config.tier_placement = rtier->get_rt(); + if (!restore_op) { + if (tier_config.tier_placement.allow_read_through) { + days = tier_config.tier_placement.read_through_restore_days; + } else { //read-through is not enabled op_ret = 
-ERR_INVALID_OBJECT_STATE; - } else { // fetch object for sync and set cloud_tier attrs - bufferlist t, t_tier; - RGWObjTier tier_config; - m.get_tier_config(&tier_config); - - t.append("cloud-s3"); - attrs[RGW_ATTR_CLOUD_TIER_TYPE] = t; - encode(tier_config, t_tier); - attrs[RGW_ATTR_CLOUD_TIER_CONFIG] = t_tier; + s->err.message = "Read through is not enabled for this config"; + return op_ret; } } - } catch (const buffer::end_of_buffer&) { - // ignore empty manifest; it's not cloud-tiered - } catch (const std::exception& e) { + // fill in the entry. XXX: Maybe we can avoid it by passing only necessary params + rgw_bucket_dir_entry ent; + ent.key.name = s->object->get_key().name; + ent.meta.accounted_size = ent.meta.size = s->obj_size; + ent.meta.etag = "" ; + ceph::real_time mtime = s->object->get_mtime(); + uint64_t epoch = 0; + op_ret = get_system_versioning_params(s, &epoch, NULL); + ldpp_dout(dpp, 20) << "getting versioning params tier placement handle cloud tier" << op_ret << dendl; + if (op_ret < 0) { + ldpp_dout(dpp, 20) << "failed to get versioning params, op_ret = " << op_ret << dendl; + s->err.message = "failed to restore object"; + return op_ret; + } + op_ret = s->object->restore_obj_from_cloud(pbucket, tier.get(), target_placement, ent, s->cct, tier_config, + mtime, epoch, days, dpp, y, s->bucket->get_info().flags); + if (op_ret < 0) { + ldpp_dout(dpp, 0) << "object " << ent.key.name << " fetching failed" << op_ret << dendl; + s->err.message = "failed to restore object"; + return op_ret; + } + ldpp_dout(dpp, 20) << "object " << ent.key.name << " fetching succeed" << dendl; + /* Even if restore is complete the first read through request will return but actually downloaded + * object asyncronously. + */ + if (!restore_op) { //read-through + op_ret = -ERR_REQUEST_TIMEOUT; + ldpp_dout(dpp, 5) << "restore is still in progress, please check restore status and retry" << dendl; + s->err.message = "restore is still in progress"; + } + return op_ret; + } else if ((!restore_op) && (restore_status == rgw::sal::RGWRestoreStatus::RestoreAlreadyInProgress)) { + op_ret = -ERR_REQUEST_TIMEOUT; + ldpp_dout(dpp, 5) << "restore is still in progress, please check restore status and retry" << dendl; + s->err.message = "restore is still in progress"; + } else { // CloudRestored..return success + return 0; } + } catch (const buffer::end_of_buffer&) { + //empty manifest; it's not cloud-tiered + if (restore_op) { + op_ret = -ERR_INVALID_OBJECT_STATE; + s->err.message = "only cloud tier object can be restored"; + } + } catch (const std::exception& e) { } - return op_ret; } @@ -2366,15 +2460,12 @@ void RGWGetObj::execute(optional_yield y) } catch (const buffer::error&) {} } - if (get_type() == RGW_OP_GET_OBJ && get_data) { - op_ret = handle_cloudtier_obj(attrs, sync_cloudtiered); + std::optional<uint64_t> days; + op_ret = handle_cloudtier_obj(s, this, driver, attrs, sync_cloudtiered, days, false, y); if (op_ret < 0) { ldpp_dout(this, 4) << "Cannot get cloud tiered object: " << *s->object - <<". Failing with " << op_ret << dendl; - if (op_ret == -ERR_INVALID_OBJECT_STATE) { - s->err.message = "This object was transitioned to cloud-s3"; - } + <<". 
Failing with " << op_ret << dendl; goto done_err; } } @@ -5155,6 +5246,73 @@ void RGWPutMetadataObject::execute(optional_yield y) op_ret = s->object->set_obj_attrs(this, &attrs, &rmattrs, s->yield, rgw::sal::FLAG_LOG_OP); } +int RGWRestoreObj::init_processing(optional_yield y) +{ + int op_ret = get_params(y); + if (op_ret < 0) { + return op_ret; + } + + return RGWOp::init_processing(y); +} + +int RGWRestoreObj::verify_permission(optional_yield y) +{ + if (!verify_bucket_permission(this, s, ARN(s->object->get_obj()), + rgw::IAM::s3RestoreObject)) { + return -EACCES; + } + + return 0; +} + +void RGWRestoreObj::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWRestoreObj::execute(optional_yield y) +{ + if (!s->bucket_exists) { + op_ret = -ERR_NO_SUCH_BUCKET; + return; + } + + s->object->set_atomic(); + int op_ret = s->object->get_obj_attrs(y, this); + if (op_ret < 0) { + ldpp_dout(this, 1) << "failed to fetch get_obj_attrs op ret = " << op_ret << dendl; + return; + } + rgw::sal::Attrs attrs = s->object->get_attrs(); + auto attr_iter = attrs.find(RGW_ATTR_MANIFEST); + if (attr_iter != attrs.end()) { + RGWObjManifest m; + decode(m, attr_iter->second); + RGWObjTier tier_config; + m.get_tier_config(&tier_config); + if (m.get_tier_type() == "cloud-s3") { + ldpp_dout(this, 20) << "execute: expiry days" << expiry_days <<dendl; + op_ret = handle_cloudtier_obj(s, this, driver, attrs, false, expiry_days, true, y); + if (op_ret < 0) { + ldpp_dout(this, 4) << "Cannot get cloud tiered object: " << *s->object + <<". Failing with " << op_ret << dendl; + if (op_ret == -ERR_INVALID_OBJECT_STATE) { + s->err.message = "This object was transitioned to cloud-s3"; + } + } + } else { + ldpp_dout(this, 20) << "not cloud tier object erroring" << dendl; + op_ret = -ERR_INVALID_OBJECT_STATE; + } + } else { + ldpp_dout(this, 20) << " manifest not found" << dendl; + } + ldpp_dout(this, 20) << "completed restore" << dendl; + + return; +} + int RGWDeleteObj::handle_slo_manifest(bufferlist& bl, optional_yield y) { RGWSLOInfo slo_info; diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 47a4c3da609..df05500a437 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -1461,6 +1461,24 @@ public: virtual bool need_object_expiration() { return false; } }; +class RGWRestoreObj : public RGWOp { +protected: + std::optional<uint64_t> expiry_days; +public: + RGWRestoreObj() {} + + int init_processing(optional_yield y) override; + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + virtual int get_params(optional_yield y) {return 0;} + + void send_response() override = 0; + const char* name() const override { return "restore_obj"; } + RGWOpType get_type() override { return RGW_OP_RESTORE_OBJ; } + uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } +}; + class RGWDeleteObj : public RGWOp { protected: bool delete_marker; diff --git a/src/rgw/rgw_op_type.h b/src/rgw/rgw_op_type.h index 12291d64cb3..f0c3b072e47 100644 --- a/src/rgw/rgw_op_type.h +++ b/src/rgw/rgw_op_type.h @@ -25,6 +25,7 @@ enum RGWOpType { RGW_OP_PUT_METADATA_BUCKET, RGW_OP_PUT_METADATA_OBJECT, RGW_OP_SET_TEMPURL, + RGW_OP_RESTORE_OBJ, RGW_OP_DELETE_OBJ, RGW_OP_COPY_OBJ, RGW_OP_GET_ACLS, diff --git a/src/rgw/rgw_pubsub.cc b/src/rgw/rgw_pubsub.cc index 92b65b0ebba..cb68d72d7da 100644 --- a/src/rgw/rgw_pubsub.cc +++ b/src/rgw/rgw_pubsub.cc @@ -570,22 +570,16 @@ RGWPubSub::RGWPubSub(rgw::sal::Driver* _driver, { } -int RGWPubSub::get_topics(const DoutPrefixProvider* dpp, - const 
std::string& start_marker, int max_items, - rgw_pubsub_topics& result, std::string& next_marker, - optional_yield y) const +int RGWPubSub::get_topics_v2(const DoutPrefixProvider* dpp, + const std::string& start_marker, int max_items, + rgw_pubsub_topics& result, std::string& next_marker, + optional_yield y) const { if (rgw::account::validate_id(tenant)) { // if our tenant is an account, return the account listing return list_account_topics(dpp, start_marker, max_items, result, next_marker, y); } - - if (!use_notification_v2 || driver->stat_topics_v1(tenant, y, dpp) != -ENOENT) { - // in case of v1 or during migration we use v1 topics - // v1 returns all topics, ignoring marker/max_items - return read_topics_v1(dpp, result, nullptr, y); - } // TODO: prefix filter on 'tenant:' void* handle = NULL; @@ -629,6 +623,13 @@ int RGWPubSub::get_topics(const DoutPrefixProvider* dpp, return ret; } +int RGWPubSub::get_topics_v1(const DoutPrefixProvider* dpp, + rgw_pubsub_topics& result, + optional_yield y) const +{ + return read_topics_v1(dpp, result, nullptr, y); +} + int RGWPubSub::list_account_topics(const DoutPrefixProvider* dpp, const std::string& start_marker, int max_items, rgw_pubsub_topics& result, diff --git a/src/rgw/rgw_pubsub.h b/src/rgw/rgw_pubsub.h index b7ce443af03..8a6b290cb85 100644 --- a/src/rgw/rgw_pubsub.h +++ b/src/rgw/rgw_pubsub.h @@ -643,9 +643,14 @@ public: // get a paginated list of topics // return 0 on success, error code otherwise - int get_topics(const DoutPrefixProvider* dpp, - const std::string& start_marker, int max_items, - rgw_pubsub_topics& result, std::string& next_marker, + int get_topics_v2(const DoutPrefixProvider* dpp, + const std::string& start_marker, int max_items, + rgw_pubsub_topics& result, std::string& next_marker, + optional_yield y) const; + + // return 0 on success, error code otherwise + int get_topics_v1(const DoutPrefixProvider* dpp, + rgw_pubsub_topics& result, optional_yield y) const; // get a topic with by its name and populate it into "result" diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h index fae60c50f4d..3abba0124a6 100644 --- a/src/rgw/rgw_rest.h +++ b/src/rgw/rgw_rest.h @@ -318,6 +318,12 @@ public: ~RGWPutMetadataObject_ObjStore() override {} }; +class RGWRestoreObj_ObjStore : public RGWRestoreObj { +public: + RGWRestoreObj_ObjStore() {} + ~RGWRestoreObj_ObjStore() override {} +}; + class RGWDeleteObj_ObjStore : public RGWDeleteObj { public: RGWDeleteObj_ObjStore() {} diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc index 45b5e3076f4..c16064a61c2 100644 --- a/src/rgw/rgw_rest_client.cc +++ b/src/rgw/rgw_rest_client.cc @@ -209,7 +209,7 @@ static int sign_request_v2(const DoutPrefixProvider *dpp, const RGWAccessKey& ke } string canonical_header; - if (!rgw_create_s3_canonical_header(dpp, info, NULL, canonical_header, false)) { + if (!rgw_create_s3_canonical_header(dpp, RGW_OP_UNKNOWN, info, NULL, canonical_header, false)) { ldpp_dout(dpp, 0) << "failed to create canonical s3 header" << dendl; return -EINVAL; } diff --git a/src/rgw/rgw_rest_pubsub.cc b/src/rgw/rgw_rest_pubsub.cc index a3784ca95b0..c0345a4f88a 100644 --- a/src/rgw/rgw_rest_pubsub.cc +++ b/src/rgw/rgw_rest_pubsub.cc @@ -493,8 +493,13 @@ void RGWPSListTopicsOp::execute(optional_yield y) { const std::string start_token = s->info.args.get("NextToken"); const RGWPubSub ps(driver, get_account_or_tenant(s->owner.id), *s->penv.site); - constexpr int max_items = 100; - op_ret = ps.get_topics(this, start_token, max_items, result, next_token, y); + if 
(rgw::all_zonegroups_support(*s->penv.site, rgw::zone_features::notification_v2) && + driver->stat_topics_v1(s->bucket->get_tenant(), null_yield, this) == -ENOENT) { + op_ret = ps.get_topics_v1(this, result, y); + } else { + constexpr int max_items = 100; + op_ret = ps.get_topics_v2(this, start_token, max_items, result, next_token, y); + } // if there are no topics it is not considered an error op_ret = op_ret == -ENOENT ? 0 : op_ret; if (op_ret < 0) { diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index 4a50baf1cb2..a245fca9945 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -519,6 +519,22 @@ int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs, } } } /* checksum_mode */ + auto attr_iter = attrs.find(RGW_ATTR_RESTORE_TYPE); + if (attr_iter != attrs.end()) { + rgw::sal::RGWRestoreType rt; + bufferlist bl = attr_iter->second; + auto iter = bl.cbegin(); + decode(rt, iter); + + if (rt == rgw::sal::RGWRestoreType::Temporary) { + // temporary restore; set storage-class to cloudtier storage class + auto c_iter = attrs.find(RGW_ATTR_CLOUDTIER_STORAGE_CLASS); + + if (c_iter != attrs.end()) { + attrs[RGW_ATTR_STORAGE_CLASS] = c_iter->second; + } + } + } for (struct response_attr_param *p = resp_attr_params; p->param; p++) { bool exists; @@ -3435,6 +3451,106 @@ int RGWPostObj_ObjStore_S3::get_encrypt_filter( return res; } +struct RestoreObjectRequest { + std::optional<uint64_t> days; + + void decode_xml(XMLObj *obj) { + RGWXMLDecoder::decode_xml("Days", days, obj); + } + + void dump_xml(Formatter *f) const { + encode_xml("Days", days, f); + } +}; + +int RGWRestoreObj_ObjStore_S3::get_params(optional_yield y) +{ + std::string expected_bucket_owner; + + if (s->info.env->get("x-amz-expected-bucket-owner") != nullptr) { + expected_bucket_owner = s->info.env->get("x-amz-expected-bucket-owner"); + } + + const auto max_size = s->cct->_conf->rgw_max_put_param_size; + + RGWXMLDecoder::XMLParser parser; + int r = 0; + bufferlist data; + std::tie(r, data) = read_all_input(s, max_size, false); + + if (r < 0) { + return r; + } + + if(!parser.init()) { + return -EINVAL; + } + + if (!parser.parse(data.c_str(), data.length(), 1)) { + return -ERR_MALFORMED_XML; + } + + RestoreObjectRequest request; + + try { + RGWXMLDecoder::decode_xml("RestoreRequest", request, &parser); + } + catch (RGWXMLDecoder::err &err) { + ldpp_dout(this, 5) << "Malformed restore request: " << err << dendl; + return -EINVAL; + } + + if (request.days) { + expiry_days = request.days.value(); + ldpp_dout(this, 10) << "expiry_days=" << expiry_days << dendl; + } else { + expiry_days=nullopt; + ldpp_dout(this, 10) << "expiry_days=" << expiry_days << dendl; + } + + return 0; +} + +void RGWRestoreObj_ObjStore_S3::send_response() +{ + if (op_ret < 0) + { + set_req_state_err(s, op_ret); + dump_errno(s); + end_header(s, this); + dump_start(s); + return; + } + + rgw::sal::Attrs attrs = s->object->get_attrs(); + auto attr_iter = attrs.find(RGW_ATTR_RESTORE_STATUS); + rgw::sal::RGWRestoreStatus restore_status; + if (attr_iter != attrs.end()) { + bufferlist bl = attr_iter->second; + auto iter = bl.cbegin(); + decode(restore_status, iter); + } + ldpp_dout(this, 10) << "restore_status=" << restore_status << dendl; + + if (attr_iter == attrs.end() || restore_status != rgw::sal::RGWRestoreStatus::None) { + s->err.http_ret = 202; //Accepted + dump_header(s, "x-amz-restore", rgw_bl_str(restore_status)); + } else if (restore_status != rgw::sal::RGWRestoreStatus::RestoreAlreadyInProgress) { + s->err.http_ret = 
409; // Conflict + dump_header_if_nonempty(s, "x-amz-restore", rgw_bl_str(restore_status)); + } else if (restore_status != rgw::sal::RGWRestoreStatus::CloudRestored) { + s->err.http_ret = 200; // OK + dump_header_if_nonempty(s, "x-amz-restore", rgw_bl_str(restore_status)); + } else { + s->err.http_ret = 202; // Accepted + dump_header_if_nonempty(s, "x-amz-restore", rgw_bl_str(restore_status)); + } + + dump_errno(s); + end_header(s, this); + dump_start(s); +} + int RGWDeleteObj_ObjStore_S3::get_params(optional_yield y) { const char *if_unmod = s->info.env->get("HTTP_X_AMZ_DELETE_IF_UNMODIFIED_SINCE"); @@ -4894,6 +5010,9 @@ RGWOp *RGWHandler_REST_Obj_S3::op_post() if (s->info.args.exists("uploads")) return new RGWInitMultipart_ObjStore_S3; + if (s->info.args.exists("restore")) + return new RGWRestoreObj_ObjStore_S3; + if (is_select_op()) return rgw::s3select::create_s3select_op(); @@ -5845,7 +5964,7 @@ AWSGeneralAbstractor::get_auth_data_v4(const req_state* const s, auto canonical_qs = rgw::auth::s3::get_v4_canonical_qs(s->info, using_qs); /* Craft canonical method. */ - auto canonical_method = rgw::auth::s3::get_v4_canonical_method(s); + auto canonical_method = rgw::auth::s3::get_canonical_method(s, s->op_type, s->info); /* Craft canonical request. */ auto canonical_req_hash = \ @@ -5945,6 +6064,7 @@ AWSGeneralAbstractor::get_auth_data_v4(const req_state* const s, case RGW_OP_PUT_BUCKET_TAGGING: case RGW_OP_PUT_BUCKET_REPLICATION: case RGW_OP_PUT_LC: + case RGW_OP_RESTORE_OBJ: case RGW_OP_SET_REQUEST_PAYMENT: case RGW_OP_PUBSUB_NOTIF_CREATE: case RGW_OP_PUBSUB_NOTIF_DELETE: @@ -6109,7 +6229,7 @@ AWSGeneralAbstractor::get_auth_data_v2(const req_state* const s) const /* Let's canonize the HTTP headers that are covered by the AWS auth v2. */ std::string string_to_sign; utime_t header_time; - if (! rgw_create_s3_canonical_header(s, s->info, &header_time, string_to_sign, + if (! 
rgw_create_s3_canonical_header(s, s->op_type, s->info, &header_time, string_to_sign, qsr)) { ldpp_dout(s, 10) << "failed to create the canonized auth header\n" << rgw::crypt_sanitize::auth{s,string_to_sign} << dendl; diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h index d86123a2525..63909f57036 100644 --- a/src/rgw/rgw_rest_s3.h +++ b/src/rgw/rgw_rest_s3.h @@ -327,6 +327,16 @@ public: rgw::sal::DataProcessor *cb) override; }; +class RGWRestoreObj_ObjStore_S3 : public RGWRestoreObj_ObjStore { + +public: + RGWRestoreObj_ObjStore_S3() {} + ~RGWRestoreObj_ObjStore_S3() override {} + + int get_params(optional_yield y) override; + void send_response() override; +}; + class RGWDeleteObj_ObjStore_S3 : public RGWDeleteObj_ObjStore { public: RGWDeleteObj_ObjStore_S3() {} diff --git a/src/rgw/rgw_s3select.cc b/src/rgw/rgw_s3select.cc index 800d276a6aa..f0b26824ca6 100644 --- a/src/rgw/rgw_s3select.cc +++ b/src/rgw/rgw_s3select.cc @@ -344,7 +344,7 @@ RGWSelectObj_ObjStore_S3::~RGWSelectObj_ObjStore_S3() int RGWSelectObj_ObjStore_S3::get_params(optional_yield y) { - if(m_s3select_query.empty() == false) { + if (m_s3select_query.empty() == false) { return 0; } #ifndef _ARROW_EXIST @@ -416,14 +416,14 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_csv(const char* query, const char* if (output_escape_char.size()) { csv.output_escape_char = *output_escape_char.c_str(); } - if(output_quote_fields.compare("ALWAYS") == 0) { + if (output_quote_fields.compare("ALWAYS") == 0) { csv.quote_fields_always = true; - } else if(output_quote_fields.compare("ASNEEDED") == 0) { + } else if (output_quote_fields.compare("ASNEEDED") == 0) { csv.quote_fields_asneeded = true; } - if(m_header_info.compare("IGNORE")==0) { + if (m_header_info.compare("IGNORE")==0) { csv.ignore_header_info=true; - } else if(m_header_info.compare("USE")==0) { + } else if (m_header_info.compare("USE")==0) { csv.use_header_info=true; } @@ -478,6 +478,7 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_parquet(const char* query) if (!m_s3_parquet_object.is_set()) { //parsing the SQL statement. s3select_syntax.parse_query(m_sql_query.c_str()); + parquet_object::csv_definitions parquet; m_s3_parquet_object.set_external_system_functions(fp_s3select_continue, fp_s3select_result_format, @@ -485,8 +486,10 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_parquet(const char* query) fp_debug_mesg); try { + //setting the Parquet-reader properties. i.e. 
the buffer-size for the Parquet-reader + parquet::ceph::S3select_Config::getInstance().set_s3select_reader_properties(s->cct->_conf->rgw_parquet_buffer_size); //at this stage the Parquet-processing requires for the meta-data that reside on Parquet object - m_s3_parquet_object.set_parquet_object(std::string("s3object"), &s3select_syntax, &m_rgw_api); + m_s3_parquet_object.set_parquet_object(std::string("s3object"), &s3select_syntax, &m_rgw_api, parquet); } catch(base_s3select_exception& e) { ldpp_dout(this, 10) << "S3select: failed upon parquet-reader construction: " << e.what() << dendl; fp_result_header_format(m_aws_response_handler.get_sql_result()); @@ -524,6 +527,7 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_json(const char* query, const char fp_s3select_result_format, fp_result_header_format, fp_debug_mesg); + json_object::csv_definitions json; m_aws_response_handler.init_response(); @@ -547,8 +551,7 @@ int RGWSelectObj_ObjStore_S3::run_s3select_on_json(const char* query, const char } //initializing json processor - json_object::csv_definitions output_definition; - m_s3_json_object.set_json_query(&s3select_syntax,output_definition); + m_s3_json_object.set_json_query(&s3select_syntax, json); if (input == nullptr) { input = ""; @@ -706,6 +709,7 @@ int RGWSelectObj_ObjStore_S3::range_request(int64_t ofs, int64_t len, void* buff RGWGetObj::parse_range(); requested_buffer.clear(); m_request_range = len; + m_aws_response_handler.update_processed_size(len); ldout(s->cct, 10) << "S3select: calling execute(async):" << " request-offset :" << ofs << " request-length :" << len << " buffer size : " << requested_buffer.size() << dendl; RGWGetObj::execute(y); if (buff) { @@ -730,7 +734,7 @@ void RGWSelectObj_ObjStore_S3::execute(optional_yield y) m_aws_response_handler.set(s, this, fp_chunked_transfer_encoding); } - if(s->cct->_conf->rgw_disable_s3select == true) + if (s->cct->_conf->rgw_disable_s3select == true) { std::string error_msg="s3select : is disabled by rgw_disable_s3select configuration parameter"; ldpp_dout(this, 10) << error_msg << dendl; @@ -749,12 +753,24 @@ void RGWSelectObj_ObjStore_S3::execute(optional_yield y) return; } s3select_syntax.parse_query(m_sql_query.c_str()); + //the run_s3select_on_parquet() calling the s3select-query-engine, that read and process the parquet object with RGW::range_request, + //upon query-engine finish the processing, the control is back to execute() + //the parquet-reader indicates the end of the parquet object. 
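A minimal pull-style sketch of that control flow, where the query engine requests byte ranges on demand and the frontend serves each one; the RangeFetcher and run_query_over_ranges names are illustrative stand-ins, not the real s3select or Arrow ReadAt API:

    // Illustrative pull-style reader: the engine requests byte ranges and the
    // caller serves them, mirroring how the Parquet path drives range requests.
    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <string>

    using RangeFetcher = std::function<std::string(uint64_t ofs, uint64_t len)>;

    // Stand-in for the reader loop: keeps pulling ranges until the whole
    // (already known) object size has been consumed.
    uint64_t run_query_over_ranges(uint64_t object_size, uint64_t chunk,
                                   const RangeFetcher& fetch) {
      uint64_t processed = 0;
      while (processed < object_size) {
        const uint64_t len = std::min(chunk, object_size - processed);
        std::string buf = fetch(processed, len);  // analogous to range_request()
        processed += buf.size();                  // accounted like update_processed_size()
      }
      return processed;
    }

    int main() {
      const std::string object(1000, 'x');        // pretend parquet object
      auto fetch = [&](uint64_t ofs, uint64_t len) {
        return object.substr(ofs, len);           // caller serves the range
      };
      std::cout << run_query_over_ranges(object.size(), 256, fetch)
                << " bytes processed\n";
      return 0;
    }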
status = run_s3select_on_parquet(m_sql_query.c_str()); if (status) { ldout(s->cct, 10) << "S3select: failed to process query <" << m_sql_query << "> on object " << s->object->get_name() << dendl; op_ret = -ERR_INVALID_REQUEST; } else { - ldout(s->cct, 10) << "S3select: complete query with success " << dendl; + //status per amount of processed data + m_aws_response_handler.update_total_bytes_returned(m_s3_parquet_object.get_return_result_size()); + m_aws_response_handler.init_stats_response(); + m_aws_response_handler.send_stats_response(); + m_aws_response_handler.init_end_response(); + ldpp_dout(this, 10) << "s3select : reached the end of parquet query request : aws_response_handler.get_processed_size() " + << m_aws_response_handler.get_processed_size() + << "m_object_size_for_processing : " << m_object_size_for_processing << dendl; + + ldout(s->cct, 10) << "S3select: complete parquet query with success " << dendl; } } else { //CSV or JSON processing @@ -762,7 +778,7 @@ void RGWSelectObj_ObjStore_S3::execute(optional_yield y) m_requested_range = (m_end_scan_sz - m_start_scan_sz); - if(m_is_trino_request){ + if (m_is_trino_request){ // fetch more than requested(m_scan_offset), that additional bytes are scanned for end of row, // thus the additional length will be processed, and no broken row for Trino. // assumption: row is smaller than m_scan_offset. (a different approach is to request for additional range) @@ -778,7 +794,8 @@ void RGWSelectObj_ObjStore_S3::execute(optional_yield y) } int RGWSelectObj_ObjStore_S3::parquet_processing(bufferlist& bl, off_t ofs, off_t len) -{ +{//purpose: to process the returned buffer from range-request, and to send it to the Parquet-reader. + //range_request() is called by arrow::ReadAt, and upon request completion the control is back to RGWSelectObj_ObjStore_S3::execute() fp_chunked_transfer_encoding(); size_t append_in_callback = 0; int part_no = 1; @@ -809,7 +826,7 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp, //the purpose is to return "perfect" results, with no broken or missing lines. off_t new_offset = 0; - if(m_scan_range_ind){//only upon range-scan + if (m_scan_range_ind){//only upon range-scan int64_t sc=0; int64_t start =0; const char* row_delimiter = m_row_delimiter.c_str(); @@ -817,10 +834,10 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp, ldpp_dout(this, 10) << "s3select query: per Trino request the first and last chunk should modified." << dendl; //chop the head of the first chunk and only upon the slice does not include the head of the object. 
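A minimal sketch of that head-chopping step, assuming a plain '\n' row delimiter; the real code uses the configured m_row_delimiter and applies a matching trim to the tail of the last chunk so only complete rows reach the query engine:

    // Sketch: advance past a partial leading row so a range scan that starts
    // mid-object only processes complete rows. Simplified stand-in logic.
    #include <cstddef>
    #include <iostream>
    #include <string_view>

    // Returns the offset of the first byte after the first row delimiter, or 0
    // if the scan starts at the head of the object (nothing to chop).
    size_t chop_partial_head(std::string_view chunk, bool scan_starts_at_head,
                             char row_delim = '\n') {
      if (scan_starts_at_head) {
        return 0;
      }
      const size_t pos = chunk.find(row_delim);
      return pos == std::string_view::npos ? chunk.size() : pos + 1;
    }

    int main() {
      std::string_view chunk = "tial row\nrow2\nrow3\n";  // starts mid-row
      const size_t skip = chop_partial_head(chunk, /*scan_starts_at_head=*/false);
      std::cout << "process from offset " << skip << ": " << chunk.substr(skip);
      return 0;
    }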
- if(m_start_scan_sz && (m_aws_response_handler.get_processed_size()==0)){ + if (m_start_scan_sz && (m_aws_response_handler.get_processed_size()==0)){ char* p = const_cast<char*>(it_cp+ofs); while(strncmp(row_delimiter,p,1) && (p - (it_cp+ofs)) < len)p++; - if(!strncmp(row_delimiter,p,1)){ + if (!strncmp(row_delimiter,p,1)){ new_offset += (p - (it_cp+ofs))+1; } } @@ -831,14 +848,14 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp, //chop the end of the last chunk for this request //if it's the last chunk, search for first row-delimiter for the following different use-cases - if((m_aws_response_handler.get_processed_size()+len) >= m_requested_range){ + if ((m_aws_response_handler.get_processed_size()+len) >= m_requested_range){ //had pass the requested range, start to search for first delimiter - if(m_aws_response_handler.get_processed_size()>m_requested_range){ + if (m_aws_response_handler.get_processed_size()>m_requested_range){ //the previous chunk contain the complete request(all data) and an extra bytes. //thus, search for the first row-delimiter //[:previous (RR) ... ][:current (RD) ] start = 0; - } else if(m_aws_response_handler.get_processed_size()){ + } else if (m_aws_response_handler.get_processed_size()){ //the *current* chunk contain the complete request in the middle of the chunk. //thus, search for the first row-delimiter after the complete request position //[:current (RR) .... (RD) ] @@ -852,7 +869,7 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp, for(sc=start;sc<len;sc++)//assumption : row-delimiter must exist or its end ebject { char* p = const_cast<char*>(it_cp) + ofs + sc; - if(!strncmp(row_delimiter,p,1)){ + if (!strncmp(row_delimiter,p,1)){ ldout(s->cct, 10) << "S3select: found row-delimiter on " << sc << " get_processed_size = " << m_aws_response_handler.get_processed_size() << dendl; len = sc + 1;//+1 is for delimiter. TODO what about m_object_size_for_processing (to update according to len) //the end of row exist in current chunk. @@ -872,7 +889,7 @@ void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp, int RGWSelectObj_ObjStore_S3::csv_processing(bufferlist& bl, off_t ofs, off_t len) { int status = 0; - if(m_skip_next_chunk == true){ + if (m_skip_next_chunk == true){ return status; } @@ -894,13 +911,13 @@ int RGWSelectObj_ObjStore_S3::csv_processing(bufferlist& bl, off_t ofs, off_t le } - if(ofs > it.length()){ + if (ofs > it.length()){ //safety check ldpp_dout(this, 10) << "offset and length may cause invalid read: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << dendl; ofs = 0; } - if(m_is_trino_request){ + if (m_is_trino_request){ //TODO replace len with it.length() ? ; test Trino flow with compressed objects. //is it possible to send get-by-ranges? in parallel? 
shape_chunk_per_trino_requests(&(it)[0], ofs, len); @@ -964,7 +981,7 @@ int RGWSelectObj_ObjStore_S3::json_processing(bufferlist& bl, off_t ofs, off_t l continue; } - if((ofs + len) > it.length()){ + if ((ofs + len) > it.length()){ ldpp_dout(this, 10) << "s3select: offset and length may cause invalid read: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << dendl; ofs = 0; len = it.length(); @@ -1025,7 +1042,7 @@ int RGWSelectObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t ofs, off_ if (len == 0 && s->obj_size != 0) { return 0; } - if (m_parquet_type) { + if (m_parquet_type) {//bufferlist sendback upon range-request return parquet_processing(bl,ofs,len); } if (m_json_type) { diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h index f89aa6f4e66..769d7435442 100644 --- a/src/rgw/rgw_sal.h +++ b/src/rgw/rgw_sal.h @@ -153,6 +153,21 @@ enum AttrsMod { static constexpr uint32_t FLAG_LOG_OP = 0x0001; static constexpr uint32_t FLAG_PREVENT_VERSIONING = 0x0002; +enum RGWRestoreStatus : uint8_t { + None = 0, + RestoreAlreadyInProgress = 1, + CloudRestored = 2, + RestoreFailed = 3 +}; + + +enum class RGWRestoreType : uint8_t { + None = 0, + Temporary = 1, + Permanent = 2 +}; + + // a simple streaming data processing abstraction /** * @brief A simple streaming data processing abstraction @@ -1199,6 +1214,18 @@ class Object { bool update_object, const DoutPrefixProvider* dpp, optional_yield y) = 0; + virtual int restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) = 0; /** Check to see if two placement rules match */ virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) = 0; /** Dump driver-specific object layout info in JSON */ diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc index 272862cb7e1..733bfa39ee2 100644 --- a/src/rgw/rgw_sal_filter.cc +++ b/src/rgw/rgw_sal_filter.cc @@ -1117,6 +1117,23 @@ int FilterObject::transition_to_cloud(Bucket* bucket, o, cloud_targets, cct, update_object, dpp, y); } +int FilterObject::restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) +{ + return next->restore_obj_from_cloud(nextBucket(bucket), nextPlacementTier(tier), + placement_rule, o, cct, tier_config, mtime, olh_epoch, days, dpp, y, flags); +} + bool FilterObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) { return next->placement_rules_match(r1, r2); diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h index b12ea53a9bb..17b102f7619 100644 --- a/src/rgw/rgw_sal_filter.h +++ b/src/rgw/rgw_sal_filter.h @@ -789,6 +789,18 @@ public: bool update_object, const DoutPrefixProvider* dpp, optional_yield y) override; + virtual int restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) override; 
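FilterObject forwards the new restore_obj_from_cloud() call to the wrapped SAL object, the same delegation it uses for its other Object operations, while StoreObject supplies a not-supported default for backends without a cloud tier. A toy decorator-style sketch of that layering, with stand-in classes rather than the real rgw::sal interfaces:

    // Toy decorator mirroring the SAL filter pattern: the filter holds the
    // next layer and forwards calls, so a filter driver can wrap any backend.
    #include <iostream>
    #include <memory>

    class Object {
     public:
      virtual ~Object() = default;
      virtual int restore_from_cloud(int days) = 0;
    };

    class StoreObject : public Object {
     public:
      // Backends that do not support restore return an error by default.
      int restore_from_cloud(int /*days*/) override { return -1; }
    };

    class RadosObject : public StoreObject {
     public:
      int restore_from_cloud(int days) override {
        std::cout << "restoring from cloud tier for " << days << " day(s)\n";
        return 0;
      }
    };

    class FilterObject : public Object {
     public:
      explicit FilterObject(std::unique_ptr<Object> next) : next(std::move(next)) {}
      // The filter adds no behaviour of its own here; it just delegates.
      int restore_from_cloud(int days) override {
        return next->restore_from_cloud(days);
      }
     private:
      std::unique_ptr<Object> next;
    };

    int main() {
      FilterObject obj(std::make_unique<RadosObject>());
      return obj.restore_from_cloud(1);
    }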
virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override; virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override; diff --git a/src/rgw/rgw_sal_store.h b/src/rgw/rgw_sal_store.h index d9b2f80e1b6..47d031fbfc6 100644 --- a/src/rgw/rgw_sal_store.h +++ b/src/rgw/rgw_sal_store.h @@ -352,6 +352,20 @@ class StoreObject : public Object { * work with lifecycle */ return -1; } + virtual int restore_obj_from_cloud(Bucket* bucket, + rgw::sal::PlacementTier* tier, + rgw_placement_rule& placement_rule, + rgw_bucket_dir_entry& o, + CephContext* cct, + RGWObjTier& tier_config, + real_time& mtime, + uint64_t olh_epoch, + std::optional<uint64_t> days, + const DoutPrefixProvider* dpp, + optional_yield y, + uint32_t flags) override { + return -1; + } jspan_context& get_trace() override { return trace_ctx; } void set_trace (jspan_context&& _trace_ctx) override { trace_ctx = std::move(_trace_ctx); } diff --git a/src/rgw/rgw_zone.cc b/src/rgw/rgw_zone.cc index 8d8b44cd961..1acaf9b3d4f 100644 --- a/src/rgw/rgw_zone.cc +++ b/src/rgw/rgw_zone.cc @@ -860,6 +860,8 @@ void RGWZoneGroupPlacementTier::decode_json(JSONObj *obj) JSONDecoder::decode_json("tier_type", tier_type, obj); JSONDecoder::decode_json("storage_class", storage_class, obj); JSONDecoder::decode_json("retain_head_object", retain_head_object, obj); + JSONDecoder::decode_json("allow_read_through", allow_read_through, obj); + JSONDecoder::decode_json("read_through_restore_days", read_through_restore_days, obj); if (tier_type == "cloud-s3") { JSONDecoder::decode_json("s3", t.s3, obj); @@ -897,6 +899,8 @@ void RGWZoneGroupPlacementTier::dump(Formatter *f) const encode_json("tier_type", tier_type, f); encode_json("storage_class", storage_class, f); encode_json("retain_head_object", retain_head_object, f); + encode_json("allow_read_through", allow_read_through, f); + encode_json("read_through_restore_days", read_through_restore_days, f); if (tier_type == "cloud-s3") { encode_json("s3", t.s3, f); diff --git a/src/rgw/rgw_zone_types.h b/src/rgw/rgw_zone_types.h index 13fce000c41..d44761d7f5a 100644 --- a/src/rgw/rgw_zone_types.h +++ b/src/rgw/rgw_zone_types.h @@ -543,9 +543,13 @@ struct RGWZoneGroupPlacementTierS3 { WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTierS3) struct RGWZoneGroupPlacementTier { +#define DEFAULT_READ_THROUGH_RESTORE_DAYS 1 + std::string tier_type; std::string storage_class; bool retain_head_object = false; + bool allow_read_through = false; + uint64_t read_through_restore_days = 1; struct _tier { RGWZoneGroupPlacementTierS3 s3; @@ -555,10 +559,12 @@ struct RGWZoneGroupPlacementTier { int clear_params(const JSONFormattable& config); void encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); encode(tier_type, bl); encode(storage_class, bl); encode(retain_head_object, bl); + encode(allow_read_through, bl); + encode(read_through_restore_days, bl); if (tier_type == "cloud-s3") { encode(t.s3, bl); } @@ -566,10 +572,14 @@ struct RGWZoneGroupPlacementTier { } void decode(bufferlist::const_iterator& bl) { - DECODE_START(1, bl); + DECODE_START(2, bl); decode(tier_type, bl); decode(storage_class, bl); decode(retain_head_object, bl); + if (struct_v >= 2) { + decode(allow_read_through, bl); + decode(read_through_restore_days, bl); + } if (tier_type == "cloud-s3") { decode(t.s3, bl); } diff --git a/src/rgw/services/svc_bi_rados.cc b/src/rgw/services/svc_bi_rados.cc index 61d4edc0c98..b34e0c23e60 100644 --- a/src/rgw/services/svc_bi_rados.cc +++ 
b/src/rgw/services/svc_bi_rados.cc @@ -5,6 +5,7 @@ #include "svc_bilog_rados.h" #include "svc_zone.h" +#include "rgw_asio_thread.h" #include "rgw_bucket.h" #include "rgw_zone.h" #include "rgw_datalog.h" @@ -339,6 +340,7 @@ int RGWSI_BucketIndex_RADOS::cls_bucket_head(const DoutPrefixProvider *dpp, list_results.emplace(iter.first, rgw_cls_list_ret()); } + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle r = CLSRGWIssueGetDirHeader(index_pool, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)(); if (r < 0) @@ -369,6 +371,7 @@ int RGWSI_BucketIndex_RADOS::init_index(const DoutPrefixProvider *dpp, map<int, string> bucket_objs; get_bucket_index_objects(dir_oid, idx_layout.layout.normal.num_shards, idx_layout.gen, &bucket_objs); + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle if (judge_support_logrecord) { return CLSRGWIssueBucketIndexInit2(index_pool, bucket_objs, @@ -397,6 +400,7 @@ int RGWSI_BucketIndex_RADOS::clean_index(const DoutPrefixProvider *dpp, const RG get_bucket_index_objects(dir_oid, idx_layout.layout.normal.num_shards, idx_layout.gen, &bucket_objs); + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle return CLSRGWIssueBucketIndexClean(index_pool, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); diff --git a/src/rgw/services/svc_bilog_rados.cc b/src/rgw/services/svc_bilog_rados.cc index a33eb508848..1212f104815 100644 --- a/src/rgw/services/svc_bilog_rados.cc +++ b/src/rgw/services/svc_bilog_rados.cc @@ -4,6 +4,7 @@ #include "svc_bilog_rados.h" #include "svc_bi_rados.h" +#include "rgw_asio_thread.h" #include "cls/rgw/cls_rgw_client.h" #define dout_subsys ceph_subsys_rgw @@ -48,6 +49,7 @@ int RGWSI_BILog_RADOS::log_trim(const DoutPrefixProvider *dpp, return r; } + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle return CLSRGWIssueBILogTrim(index_pool, start_marker_mgr, end_marker_mgr, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); } @@ -61,6 +63,7 @@ int RGWSI_BILog_RADOS::log_start(const DoutPrefixProvider *dpp, const RGWBucketI if (r < 0) return r; + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle return CLSRGWIssueResyncBucketBILog(index_pool, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); } @@ -73,6 +76,7 @@ int RGWSI_BILog_RADOS::log_stop(const DoutPrefixProvider *dpp, const RGWBucketIn if (r < 0) return r; + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle return CLSRGWIssueBucketBILogStop(index_pool, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)(); } @@ -113,6 +117,7 @@ int RGWSI_BILog_RADOS::log_list(const DoutPrefixProvider *dpp, if (r < 0) return r; + maybe_warn_about_blocking(dpp); // TODO: use AioTrottle r = CLSRGWIssueBILogList(index_pool, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)(); if (r < 0) return r; diff --git a/src/rgw/services/svc_sys_obj_core.cc b/src/rgw/services/svc_sys_obj_core.cc index 397709c5d99..cdbbf353832 100644 --- a/src/rgw/services/svc_sys_obj_core.cc +++ b/src/rgw/services/svc_sys_obj_core.cc @@ -169,21 +169,21 @@ int RGWSI_SysObj_Core::read(const DoutPrefixProvider *dpp, } } - rgw_rados_ref rados_obj; - int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj); + rgw_rados_ref ref; + int r = get_rados_obj(dpp, zone_svc, obj, &ref); if (r < 0) { ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl; return r; } - r = rados_obj.operate(dpp, &op, nullptr, y); + + version_t op_ver = 0; + r = rgw_rados_operate(dpp, ref.ioctx, obj.oid, &op, nullptr, y, 0, nullptr, &op_ver); if (r < 0) { ldpp_dout(dpp, 20) << 
"rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl; return r; } ldpp_dout(dpp, 20) << "rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl; - uint64_t op_ver = rados_obj.ioctx.get_last_version(); - if (read_state.last_ver > 0 && read_state.last_ver != op_ver) { ldpp_dout(dpp, 5) << "raced with an object write, abort" << dendl; diff --git a/src/s3select b/src/s3select -Subproject f333ec82e6e8a3f7eb9ba1041d1442b2c7cd0f0 +Subproject 0a0f6d439441f5b121ed1052dac54542e4f1d89 diff --git a/src/test/cli-integration/rbd/gwcli_create.t b/src/test/cli-integration/rbd/gwcli_create.t index b464681fba0..44c75082c94 100644 --- a/src/test/cli-integration/rbd/gwcli_create.t +++ b/src/test/cli-integration/rbd/gwcli_create.t @@ -1,43 +1,50 @@ -Podman find iSCSI container -=========================== - $ ISCSI_CONTAINER=$(sudo podman ps -a | grep -F 'iscsi' | grep -Fv 'tcmu' | awk '{print $1}') +Cephadm prefers podman to docker +================================ + $ CENGINE=docker + > if command -v podman >/dev/null; then + > CENGINE=podman + > fi + +Find iSCSI container +==================== + $ ISCSI_CONTAINER=$(sudo $CENGINE ps -a | grep -F 'iscsi' | grep -Fv 'tcmu' | awk '{print $1}') Dismiss the "could not load preferences file .gwcli/prefs.bin" warning ====================================================================== - $ sudo podman exec $ISCSI_CONTAINER gwcli ls >/dev/null 2>&1 + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls >/dev/null 2>&1 Create a datapool/block0 disk ============================= - $ sudo podman exec $ISCSI_CONTAINER gwcli disks/ create pool=datapool image=block0 size=300M wwn=36001405da17b74481464e9fa968746d3 - $ sudo podman exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- disks' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli disks/ create pool=datapool image=block0 size=300M wwn=36001405da17b74481464e9fa968746d3 + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- disks' | awk -F'[' '{print $2}' 300M, Disks: 1] - $ sudo podman exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- datapool' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- datapool' | awk -F'[' '{print $2}' datapool (300M)] - $ sudo podman exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- block0' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- block0' | awk -F'[' '{print $2}' datapool/block0 (Unknown, 300M)] Create the target IQN ===================== - $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/ create target_iqn=iqn.2003-01.com.redhat.iscsi-gw:ceph-gw - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iscsi-targets' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/ create target_iqn=iqn.2003-01.com.redhat.iscsi-gw:ceph-gw + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iscsi-targets' | awk -F'[' '{print $2}' DiscoveryAuth: None, Targets: 1] - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.2003-01.com.redhat.iscsi-gw:ceph-gw' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.2003-01.com.redhat.iscsi-gw:ceph-gw' | awk -F'[' '{print $2}' Auth: None, Gateways: 0] - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}' Disks: 0] 
- $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}' Up: 0/0, Portals: 0] - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- host-groups' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- host-groups' | awk -F'[' '{print $2}' Groups : 0] - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}' Auth: ACL_ENABLED, Hosts: 0] Create the first gateway ======================== $ HOST=$(python3 -c "import socket; print(socket.getfqdn())") > IP=`hostname -i | awk '{print $1}'` - > sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}' + > sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}' Up: 1/1, Portals: 1] Create the second gateway @@ -45,34 +52,34 @@ Create the second gateway $ IP=`cat /etc/ceph/iscsi-gateway.cfg |grep 'trusted_ip_list' | awk -F'[, ]' '{print $3}'` > if [ "$IP" != `hostname -i | awk '{print $1}'` ]; then > HOST=$(python3 -c "import socket; print(socket.getfqdn('$IP'))") - > sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST + > sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST > fi $ IP=`cat /etc/ceph/iscsi-gateway.cfg |grep 'trusted_ip_list' | awk -F'[, ]' '{print $4}'` > if [ "$IP" != `hostname -i | awk '{print $1}'` ]; then > HOST=$(python3 -c "import socket; print(socket.getfqdn('$IP'))") - > sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST + > sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/gateways create ip_addresses=$IP gateway_name=$HOST > fi - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- gateways' | awk -F'[' '{print $2}' Up: 2/2, Portals: 2] Attach the disk =============== - $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/disks/ add disk=datapool/block0 - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/disks/ add disk=datapool/block0 + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}' Disks: 1] Create a host ============= - $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts create client_iqn=iqn.1994-05.com.redhat:client - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' 
'{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts create client_iqn=iqn.1994-05.com.redhat:client + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}' Auth: ACL_ENABLED, Hosts: 1] - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.1994-05.com.redhat:client' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.1994-05.com.redhat:client' | awk -F'[' '{print $2}' Auth: None, Disks: 0(0.00Y)] Map the LUN =========== - $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts/iqn.1994-05.com.redhat:client disk disk=datapool/block0 - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts/iqn.1994-05.com.redhat:client disk disk=datapool/block0 + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}' Auth: ACL_ENABLED, Hosts: 1] - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.1994-05.com.redhat:client' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iqn.1994-05.com.redhat:client' | awk -F'[' '{print $2}' Auth: None, Disks: 1(300M)] diff --git a/src/test/cli-integration/rbd/gwcli_delete.t b/src/test/cli-integration/rbd/gwcli_delete.t index e973d87a39f..64f75acdd56 100644 --- a/src/test/cli-integration/rbd/gwcli_delete.t +++ b/src/test/cli-integration/rbd/gwcli_delete.t @@ -1,31 +1,38 @@ -Podman find iSCSI container -=========================== - $ ISCSI_CONTAINER=$(sudo podman ps -a | grep -F 'iscsi' | grep -Fv 'tcmu' | awk '{print $1}') +Cephadm prefers podman to docker +================================ + $ CENGINE=docker + > if command -v podman >/dev/null; then + > CENGINE=podman + > fi + +Find iSCSI container +==================== + $ ISCSI_CONTAINER=$(sudo $CENGINE ps -a | grep -F 'iscsi' | grep -Fv 'tcmu' | awk '{print $1}') Dismiss the "could not load preferences file .gwcli/prefs.bin" warning ====================================================================== - $ sudo podman exec $ISCSI_CONTAINER gwcli ls >/dev/null 2>&1 + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls >/dev/null 2>&1 Delete the host =============== - $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts delete client_iqn=iqn.1994-05.com.redhat:client - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/hosts delete client_iqn=iqn.1994-05.com.redhat:client + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- hosts' | awk -F'[' '{print $2}' Auth: ACL_ENABLED, Hosts: 0] Delete the iscsi-targets disk ============================= - $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/disks/ delete disk=datapool/block0 - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/iqn.2003-01.com.redhat.iscsi-gw:ceph-gw/disks/ delete disk=datapool/block0 + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- disks' | awk -F'[' 
'{print $2}' Disks: 0] Delete the target IQN ===================== - $ sudo podman exec $ISCSI_CONTAINER gwcli iscsi-targets/ delete target_iqn=iqn.2003-01.com.redhat.iscsi-gw:ceph-gw - $ sudo podman exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iscsi-targets' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli iscsi-targets/ delete target_iqn=iqn.2003-01.com.redhat.iscsi-gw:ceph-gw + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls iscsi-targets/ | grep 'o- iscsi-targets' | awk -F'[' '{print $2}' DiscoveryAuth: None, Targets: 0] Delete the disks ================ - $ sudo podman exec $ISCSI_CONTAINER gwcli disks/ delete image_id=datapool/block0 - $ sudo podman exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- disks' | awk -F'[' '{print $2}' + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli disks/ delete image_id=datapool/block0 + $ sudo $CENGINE exec $ISCSI_CONTAINER gwcli ls disks/ | grep 'o- disks' | awk -F'[' '{print $2}' 0.00Y, Disks: 0] diff --git a/src/test/cli-integration/rbd/iscsi_client.t b/src/test/cli-integration/rbd/iscsi_client.t index f636d540d89..9a659e49eca 100644 --- a/src/test/cli-integration/rbd/iscsi_client.t +++ b/src/test/cli-integration/rbd/iscsi_client.t @@ -1,7 +1,7 @@ Login to the target =================== $ IP=`cat /etc/ceph/iscsi-gateway.cfg |grep 'trusted_ip_list' | awk -F'[, ]' '{print $3}'` - > sudo iscsiadm -m discovery -t st -p $IP -l 2&> /dev/null + $ sudo iscsiadm -m discovery -t st -p $IP -l >/dev/null 2>&1 $ sleep 10 $ sudo ls /dev/disk/by-path/ |grep 'iscsi-iqn.2003-01.com.redhat.iscsi-gw:ceph-gw' |wc -l 2 diff --git a/src/test/client/nonblocking.cc b/src/test/client/nonblocking.cc index d4aecb10ffc..93bcfabd3fc 100644 --- a/src/test/client/nonblocking.cc +++ b/src/test/client/nonblocking.cc @@ -111,6 +111,8 @@ TEST_F(TestClient, LlreadvLlwritev) { writefinish.reset(new C_SaferCond("test-nonblocking-writefinish")); readfinish.reset(new C_SaferCond("test-nonblocking-readfinish")); ssize_t nwritten_a = iov_out_a[0].iov_len + iov_out_a[1].iov_len; + // reset bufferlist + bl.clear(); rc = client->ll_preadv_pwritev(fh, iov_out_a, 2, 100, true, writefinish.get(), nullptr); ASSERT_EQ(0, rc); @@ -130,6 +132,8 @@ TEST_F(TestClient, LlreadvLlwritev) { writefinish.reset(new C_SaferCond("test-nonblocking-writefinish")); readfinish.reset(new C_SaferCond("test-nonblocking-readfinish")); ssize_t nwritten_b = iov_out_b[0].iov_len + iov_out_b[1].iov_len; + // reset bufferlist + bl.clear(); rc = client->ll_preadv_pwritev(fh, iov_out_b, 2, 1000, true, writefinish.get(), nullptr, true, false); ASSERT_EQ(0, rc); diff --git a/src/test/common/test_json_formatter.cc b/src/test/common/test_json_formatter.cc index 8a0f547a929..9cc19b24ad1 100644 --- a/src/test/common/test_json_formatter.cc +++ b/src/test/common/test_json_formatter.cc @@ -17,6 +17,7 @@ #include "common/ceph_json.h" #include "common/Clock.h" +#include "common/StackStringStream.h" #include <sstream> @@ -79,3 +80,25 @@ TEST(formatter, utime) EXPECT_EQ(input.sec(), output.sec()); EXPECT_EQ(input.nsec(), output.nsec()); } + +TEST(formatter, dump_inf_or_nan) +{ + JSONFormatter formatter; + formatter.open_object_section("inf_and_nan"); + double inf = std::numeric_limits<double>::infinity(); + formatter.dump_float("positive_infinity", inf); + formatter.dump_float("negative_infinity", -inf); + formatter.dump_float("nan_val", std::numeric_limits<double>::quiet_NaN()); + formatter.dump_float("nan_val_alt", std::nan("")); + formatter.close_section(); + bufferlist bl; + formatter.flush(bl); + 
std::cout << std::string(bl.c_str(), bl.length()) << std::endl; + JSONParser parser; + parser.parse(bl.c_str(), bl.length()); + EXPECT_TRUE(parser.parse(bl.c_str(), bl.length())); + EXPECT_EQ(parser.find_obj("positive_infinity")->get_data(), "null"); + EXPECT_EQ(parser.find_obj("negative_infinity")->get_data(), "null"); + EXPECT_EQ(parser.find_obj("nan_val")->get_data(), "null"); + EXPECT_EQ(parser.find_obj("nan_val_alt")->get_data(), "null"); +} diff --git a/src/test/common/test_mutex_debug.cc b/src/test/common/test_mutex_debug.cc index 977dfe738a9..cee4b427770 100644 --- a/src/test/common/test_mutex_debug.cc +++ b/src/test/common/test_mutex_debug.cc @@ -1,5 +1,5 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 &smarttab +// vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * @@ -57,21 +57,13 @@ TEST(MutexDebug, Lock) { test_lock<ceph::mutex_debug>(); } -TEST(MutexDebug, NotRecursive) { +TEST(MutexDebugDeathTest, NotRecursive) { ceph::mutex_debug m("foo"); - auto ttl = &test_try_lock<mutex_debug>; - - ASSERT_NO_THROW(m.lock()); - ASSERT_TRUE(m.is_locked()); - ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); - - ASSERT_THROW(m.lock(), std::system_error); + // avoid assert during test cleanup where the mutex is locked and cannot be + // pthread_mutex_destroy'd + std::unique_lock locker{m}; ASSERT_TRUE(m.is_locked()); - ASSERT_FALSE(std::async(std::launch::async, ttl, &m).get()); - - ASSERT_NO_THROW(m.unlock()); - ASSERT_FALSE(m.is_locked()); - ASSERT_TRUE(std::async(std::launch::async, ttl, &m).get()); + ASSERT_DEATH(m.lock(), "FAILED ceph_assert(recursive || !is_locked_by_me())"); } TEST(MutexRecursiveDebug, Lock) { diff --git a/src/test/crimson/seastar_runner.h b/src/test/crimson/seastar_runner.h index 63cc50d9f05..590eef13adf 100644 --- a/src/test/crimson/seastar_runner.h +++ b/src/test/crimson/seastar_runner.h @@ -71,6 +71,19 @@ struct SeastarRunner { auto ret = app.run(argc, argv, [this] { on_end.reset(new seastar::readable_eventfd); return seastar::now().then([this] { +// FIXME: The stall detector uses glibc backtrace function to +// collect backtraces, this causes ASAN failures on ARM. +// For now we just extend timeout duration to 10000h in order to +// get the same effect as disabling the stall detector which is not provided by seastar. +// the ticket about migrating to libunwind: https://github.com/scylladb/seastar/issues/1878 +// Will remove once the ticket fixed. 
+// Ceph ticket see: https://tracker.ceph.com/issues/65635 +#ifdef __aarch64__ + seastar::smp::invoke_on_all([] { + using namespace std::chrono; + seastar::engine().update_blocked_reactor_notify_ms(duration_cast<milliseconds>(10000h)); + }).get(); +#endif begin_signaled = true; [[maybe_unused]] auto r = ::eventfd_write(begin_fd.get(), APP_RUNNING); assert(r == 0); diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 6648719c61c..df743327aaa 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -128,7 +128,8 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener { void enqueue_push( const hobject_t& obj, - const eversion_t& v) override; + const eversion_t& v, + const std::vector<pg_shard_t> &peers) override; void enqueue_drop( const pg_shard_t& target, @@ -243,6 +244,10 @@ struct BackfillFixture::PeeringFacade void update_complete_backfill_object_stats(const hobject_t &hoid, const pg_stat_t &stats) override { } + void prepare_backfill_for_missing( + const hobject_t &soid, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) override {} bool is_backfilling() const override { return true; } @@ -270,6 +275,9 @@ BackfillFixture::BackfillFixture( this->backfill_targets), std::make_unique<PGFacade>(this->backfill_source)) { + seastar::global_logger_registry().set_all_loggers_level( + seastar::log_level::debug + ); backfill_state.process_event(crimson::osd::BackfillState::Triggered{}.intrusive_from_this()); } @@ -303,7 +311,8 @@ void BackfillFixture::request_primary_scan( void BackfillFixture::enqueue_push( const hobject_t& obj, - const eversion_t& v) + const eversion_t& v, + const std::vector<pg_shard_t> &) { for (auto& [ _, bt ] : backfill_targets) { bt.store.push(obj, v); diff --git a/src/test/exporter/test_exporter.cc b/src/test/exporter/test_exporter.cc index 907884fe35d..e24773886bc 100644 --- a/src/test/exporter/test_exporter.cc +++ b/src/test/exporter/test_exporter.cc @@ -1,6 +1,8 @@ #include "common/ceph_argparse.h" #include "common/config.h" #include "common/config_proxy.h" +#include "common/admin_socket.h" +#include "common/admin_socket_client.h" #include <gmock/gmock.h> #include "gtest/gtest.h" #include "common/ceph_context.h" @@ -8,6 +10,7 @@ #include "global/global_init.h" #include "exporter/util.h" #include "exporter/DaemonMetricCollector.h" +#include <filesystem> #include <regex> #include <string> @@ -674,6 +677,27 @@ static std::vector<std::pair<std::string, std::string>> promethize_data = { {"rocksdb.submit_sync_latency_sum", "ceph_rocksdb_submit_sync_latency_sum"} }; + +class AdminSocketTest +{ +public: + explicit AdminSocketTest(AdminSocket *asokc) + : m_asokc(asokc) + { + } + bool init(const std::string &uri) { + return m_asokc->init(uri); + } + std::string bind_and_listen(const std::string &sock_path, int *fd) { + return m_asokc->bind_and_listen(sock_path, fd); + } + bool shutdown() { + m_asokc->shutdown(); + return true; + } + AdminSocket *m_asokc; +}; + int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); @@ -1289,8 +1313,11 @@ ceph_mon_session_rm{ceph_daemon="mon.a"} 577 # TYPE ceph_mon_session_trim counter ceph_mon_session_trim{ceph_daemon="mon.a"} 9 )"; - - ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos); + + std::string actualMetrics = collector.metrics; + std::cout << "Actual MON Metrics: " << actualMetrics << std::endl; + ASSERT_TRUE(actualMetrics.find(expectedMetrics) != std::string::npos); + 
//ASSERT_TRUE(collector.metrics.find(expectedMetrics) != std::string::npos); // Test for labeled metrics - RGW daemon = "ceph-client.rgw.foo.ceph-node-00.aayrrj.2.93993527376064"; @@ -1452,3 +1479,82 @@ TEST(Exporter, add_fixed_name_metrics) { EXPECT_EQ(new_metric.first, expected_labels); ASSERT_TRUE(new_metric.second == expected_metric_name); } + +TEST(Exporter, UpdateSockets) { + const std::string mock_dir = "/tmp/fake_sock_dir"; + + // Create the mock directory + std::filesystem::create_directories(mock_dir); + + // Create a mix of vstart and real cluster mock .asok files + std::ofstream(mock_dir + "/ceph-osd.0.asok").close(); + std::ofstream(mock_dir + "/ceph-mds.a.asok").close(); + std::ofstream(mock_dir + "/ceph-mgr.chatest-node-00.ijzynn.asok").close(); + std::ofstream(mock_dir + "/ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952.asok").close(); + std::ofstream(mock_dir + "/ceph-client.ceph-exporter.chatest-node-00.asok").close(); + std::ofstream(mock_dir + "/ceph-mon.chatest-node-00.asok").close(); + + g_conf().set_val("exporter_sock_dir", mock_dir); + + DaemonMetricCollector collector; + + // Run the function that interacts with the mock directory + collector.update_sockets(); + + // Verify the expected results + ASSERT_EQ(collector.clients.size(), 4); + ASSERT_TRUE(collector.clients.find("ceph-osd.0") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mds.a") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-mon.chatest-node-00") != collector.clients.end()); + ASSERT_TRUE(collector.clients.find("ceph-client.rgw.rgwfoo.chatest-node-00.yqaoen.2.94354846193952") != collector.clients.end()); + + + // Remove the mock directory and files + std::filesystem::remove_all(mock_dir); +} + + +TEST(Exporter, HealthMetrics) { + std::map<std::string, AdminSocketClient> clients; + DaemonMetricCollector &collector = collector_instance(); + std::string daemon = "test_daemon"; + std::string expectedCounterDump = ""; + std::string expectedCounterSchema = ""; + std::string metricName = "ceph_daemon_socket_up"; + + // Fake admin socket + std::string asok_path = "/tmp/" + daemon + ".asok"; + std::unique_ptr<AdminSocket> asokc = std::make_unique<AdminSocket>(g_ceph_context); + AdminSocketClient client(asok_path); + + // Add the daemon clients to the collector + clients.insert({daemon, std::move(client)}); + collector.clients = clients; + + auto verifyMetricValue = [&](const std::string &metricValue, bool shouldInitializeSocket) { + collector.metrics = ""; + + if (shouldInitializeSocket) { + AdminSocketTest asoct(asokc.get()); + ASSERT_TRUE(asoct.init(asok_path)); + } + + collector.dump_asok_metrics(true, 5, true, expectedCounterDump, expectedCounterSchema, false); + + if (shouldInitializeSocket) { + AdminSocketTest asoct(asokc.get()); + ASSERT_TRUE(asoct.shutdown()); + } + + std::string retrievedMetrics = collector.metrics; + std::string pattern = metricName + R"(\{[^}]*ceph_daemon=\")" + daemon + R"(\"[^}]*\}\s+)" + metricValue + R"(\b)"; + std::regex regexPattern(pattern); + ASSERT_TRUE(std::regex_search(retrievedMetrics, regexPattern)); + }; + + // Test an admin socket not answering: metric value should be "0" + verifyMetricValue("0", false); + + // Test an admin socket answering: metric value should be "1" + verifyMetricValue("1", true); +} diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc index f2c87168633..6f10d2bbd4e 100644 --- a/src/test/libcephfs/test.cc +++ b/src/test/libcephfs/test.cc @@ -976,6 +976,13 @@ 
TEST(LibCephFS, Symlinks) { fd = ceph_open(cmount, test_symlink, O_NOFOLLOW, 0); ASSERT_EQ(fd, -CEPHFS_ELOOP); +#if defined(__linux__) && defined(O_PATH) + // test the O_NOFOLLOW with O_PATH case + fd = ceph_open(cmount, test_symlink, O_PATH|O_NOFOLLOW, 0); + ASSERT_GT(fd, 0); + ceph_close(cmount, fd); +#endif /* __linux */ + // stat the original file struct ceph_statx stx_orig; ASSERT_EQ(ceph_statx(cmount, test_file, &stx_orig, CEPH_STATX_ALL_STATS, 0), 0); @@ -3012,6 +3019,18 @@ TEST(LibCephFS, Readlinkat) { ASSERT_EQ(0, memcmp(target, rel_file_path, target_len)); ASSERT_EQ(0, ceph_close(cmount, fd)); +#if defined(__linux__) && defined(O_PATH) + // test readlinkat with empty pathname relative to O_PATH|O_NOFOLLOW fd + fd = ceph_open(cmount, link_path, O_PATH | O_NOFOLLOW, 0); + ASSERT_LE(0, fd); + size_t link_target_len = strlen(rel_file_path); + char link_target[link_target_len+1]; + ASSERT_EQ(link_target_len, ceph_readlinkat(cmount, fd, "", link_target, link_target_len)); + link_target[link_target_len] = '\0'; + ASSERT_EQ(0, memcmp(link_target, rel_file_path, link_target_len)); + ASSERT_EQ(0, ceph_close(cmount, fd)); +#endif /* __linux */ + ASSERT_EQ(0, ceph_unlink(cmount, link_path)); ASSERT_EQ(0, ceph_unlink(cmount, file_path)); ASSERT_EQ(0, ceph_rmdir(cmount, dir_path)); diff --git a/src/test/librados/asio.cc b/src/test/librados/asio.cc index 9f8844eb7bb..01ebb957150 100644 --- a/src/test/librados/asio.cc +++ b/src/test/librados/asio.cc @@ -28,8 +28,6 @@ #define dout_subsys ceph_subsys_rados #define dout_context g_ceph_context -using namespace std; - // test fixture for global setup/teardown class AsioRados : public ::testing::Test { static constexpr auto poolname = "ceph_test_rados_api_asio"; @@ -73,6 +71,9 @@ librados::Rados AsioRados::rados; librados::IoCtx AsioRados::io; librados::IoCtx AsioRados::snapio; +using boost::system::error_code; +using read_result = std::tuple<version_t, bufferlist>; + void rethrow(std::exception_ptr eptr) { if (eptr) std::rethrow_exception(eptr); } @@ -81,14 +82,17 @@ TEST_F(AsioRados, AsyncReadCallback) { boost::asio::io_context service; - auto success_cb = [&] (boost::system::error_code ec, bufferlist bl) { + auto success_cb = [&] (error_code ec, version_t ver, bufferlist bl) { EXPECT_FALSE(ec); + EXPECT_LT(0, ver); EXPECT_EQ("hello", bl.to_str()); }; librados::async_read(service, io, "exist", 256, 0, success_cb); - auto failure_cb = [&] (boost::system::error_code ec, bufferlist bl) { + auto failure_cb = [&] (error_code ec, version_t ver, bufferlist bl) { EXPECT_EQ(boost::system::errc::no_such_file_or_directory, ec); + EXPECT_EQ(0, ver); + EXPECT_EQ(0, bl.length()); }; librados::async_read(service, io, "noexist", 256, 0, failure_cb); @@ -99,17 +103,17 @@ TEST_F(AsioRados, AsyncReadFuture) { boost::asio::io_context service; - std::future<bufferlist> f1 = librados::async_read(service, io, "exist", 256, - 0, boost::asio::use_future); - std::future<bufferlist> f2 = librados::async_read(service, io, "noexist", 256, - 0, boost::asio::use_future); + auto f1 = librados::async_read(service, io, "exist", 256, + 0, boost::asio::use_future); + auto f2 = librados::async_read(service, io, "noexist", 256, + 0, boost::asio::use_future); service.run(); - EXPECT_NO_THROW({ - auto bl = f1.get(); - EXPECT_EQ("hello", bl.to_str()); - }); + auto [ver, bl] = f1.get(); + EXPECT_LT(0, ver); + EXPECT_EQ("hello", bl.to_str()); + EXPECT_THROW(f2.get(), boost::system::system_error); } @@ -118,17 +122,22 @@ TEST_F(AsioRados, AsyncReadYield) boost::asio::io_context service; auto 
success_cr = [&] (boost::asio::yield_context yield) { - boost::system::error_code ec; - auto bl = librados::async_read(service, io, "exist", 256, 0, yield[ec]); + error_code ec; + auto [ver, bl] = librados::async_read(service, io, "exist", 256, + 0, yield[ec]); EXPECT_FALSE(ec); + EXPECT_LT(0, ver); EXPECT_EQ("hello", bl.to_str()); }; boost::asio::spawn(service, success_cr, rethrow); auto failure_cr = [&] (boost::asio::yield_context yield) { - boost::system::error_code ec; - auto bl = librados::async_read(service, io, "noexist", 256, 0, yield[ec]); + error_code ec; + auto [ver, bl] = librados::async_read(service, io, "noexist", 256, + 0, yield[ec]); EXPECT_EQ(boost::system::errc::no_such_file_or_directory, ec); + EXPECT_EQ(0, ver); + EXPECT_EQ(0, bl.length()); }; boost::asio::spawn(service, failure_cr, rethrow); @@ -142,14 +151,16 @@ TEST_F(AsioRados, AsyncWriteCallback) bufferlist bl; bl.append("hello"); - auto success_cb = [&] (boost::system::error_code ec) { + auto success_cb = [&] (error_code ec, version_t ver) { EXPECT_FALSE(ec); + EXPECT_LT(0, ver); }; librados::async_write(service, io, "exist", bl, bl.length(), 0, success_cb); - auto failure_cb = [&] (boost::system::error_code ec) { + auto failure_cb = [&] (error_code ec, version_t ver) { EXPECT_EQ(boost::system::errc::read_only_file_system, ec); + EXPECT_EQ(0, ver); }; librados::async_write(service, snapio, "exist", bl, bl.length(), 0, failure_cb); @@ -171,7 +182,7 @@ TEST_F(AsioRados, AsyncWriteFuture) service.run(); - EXPECT_NO_THROW(f1.get()); + EXPECT_LT(0, f1.get()); EXPECT_THROW(f2.get(), boost::system::system_error); } @@ -183,19 +194,21 @@ TEST_F(AsioRados, AsyncWriteYield) bl.append("hello"); auto success_cr = [&] (boost::asio::yield_context yield) { - boost::system::error_code ec; - librados::async_write(service, io, "exist", bl, bl.length(), 0, - yield[ec]); + error_code ec; + auto ver = librados::async_write(service, io, "exist", bl, + bl.length(), 0, yield[ec]); EXPECT_FALSE(ec); + EXPECT_LT(0, ver); EXPECT_EQ("hello", bl.to_str()); }; boost::asio::spawn(service, success_cr, rethrow); auto failure_cr = [&] (boost::asio::yield_context yield) { - boost::system::error_code ec; - librados::async_write(service, snapio, "exist", bl, bl.length(), 0, - yield[ec]); + error_code ec; + auto ver = librados::async_write(service, snapio, "exist", bl, + bl.length(), 0, yield[ec]); EXPECT_EQ(boost::system::errc::read_only_file_system, ec); + EXPECT_EQ(0, ver); }; boost::asio::spawn(service, failure_cr, rethrow); @@ -208,8 +221,9 @@ TEST_F(AsioRados, AsyncReadOperationCallback) { librados::ObjectReadOperation op; op.read(0, 0, nullptr, nullptr); - auto success_cb = [&] (boost::system::error_code ec, bufferlist bl) { + auto success_cb = [&] (error_code ec, version_t ver, bufferlist bl) { EXPECT_FALSE(ec); + EXPECT_LT(0, ver); EXPECT_EQ("hello", bl.to_str()); }; librados::async_operate(service, io, "exist", &op, 0, nullptr, success_cb); @@ -217,8 +231,10 @@ TEST_F(AsioRados, AsyncReadOperationCallback) { librados::ObjectReadOperation op; op.read(0, 0, nullptr, nullptr); - auto failure_cb = [&] (boost::system::error_code ec, bufferlist bl) { + auto failure_cb = [&] (error_code ec, version_t ver, bufferlist bl) { EXPECT_EQ(boost::system::errc::no_such_file_or_directory, ec); + EXPECT_EQ(0, ver); + EXPECT_EQ(0, bl.length()); }; librados::async_operate(service, io, "noexist", &op, 0, nullptr, failure_cb); } @@ -228,14 +244,14 @@ TEST_F(AsioRados, AsyncReadOperationCallback) TEST_F(AsioRados, AsyncReadOperationFuture) { 
boost::asio::io_context service; - std::future<bufferlist> f1; + std::future<read_result> f1; { librados::ObjectReadOperation op; op.read(0, 0, nullptr, nullptr); f1 = librados::async_operate(service, io, "exist", &op, 0, nullptr, boost::asio::use_future); } - std::future<bufferlist> f2; + std::future<read_result> f2; { librados::ObjectReadOperation op; op.read(0, 0, nullptr, nullptr); @@ -244,10 +260,10 @@ TEST_F(AsioRados, AsyncReadOperationFuture) } service.run(); - EXPECT_NO_THROW({ - auto bl = f1.get(); - EXPECT_EQ("hello", bl.to_str()); - }); + auto [ver, bl] = f1.get(); + EXPECT_LT(0, ver); + EXPECT_EQ("hello", bl.to_str()); + EXPECT_THROW(f2.get(), boost::system::system_error); } @@ -258,10 +274,11 @@ TEST_F(AsioRados, AsyncReadOperationYield) auto success_cr = [&] (boost::asio::yield_context yield) { librados::ObjectReadOperation op; op.read(0, 0, nullptr, nullptr); - boost::system::error_code ec; - auto bl = librados::async_operate(service, io, "exist", &op, 0, nullptr, - yield[ec]); + error_code ec; + auto [ver, bl] = librados::async_operate(service, io, "exist", &op, + 0, nullptr, yield[ec]); EXPECT_FALSE(ec); + EXPECT_LT(0, ver); EXPECT_EQ("hello", bl.to_str()); }; boost::asio::spawn(service, success_cr, rethrow); @@ -269,10 +286,12 @@ TEST_F(AsioRados, AsyncReadOperationYield) auto failure_cr = [&] (boost::asio::yield_context yield) { librados::ObjectReadOperation op; op.read(0, 0, nullptr, nullptr); - boost::system::error_code ec; - auto bl = librados::async_operate(service, io, "noexist", &op, 0, nullptr, - yield[ec]); + error_code ec; + auto [ver, bl] = librados::async_operate(service, io, "noexist", &op, + 0, nullptr, yield[ec]); EXPECT_EQ(boost::system::errc::no_such_file_or_directory, ec); + EXPECT_EQ(0, ver); + EXPECT_EQ(0, bl.length()); }; boost::asio::spawn(service, failure_cr, rethrow); @@ -289,16 +308,18 @@ TEST_F(AsioRados, AsyncWriteOperationCallback) { librados::ObjectWriteOperation op; op.write_full(bl); - auto success_cb = [&] (boost::system::error_code ec) { + auto success_cb = [&] (error_code ec, version_t ver) { EXPECT_FALSE(ec); + EXPECT_LT(0, ver); }; librados::async_operate(service, io, "exist", &op, 0, nullptr, success_cb); } { librados::ObjectWriteOperation op; op.write_full(bl); - auto failure_cb = [&] (boost::system::error_code ec) { + auto failure_cb = [&] (error_code ec, version_t ver) { EXPECT_EQ(boost::system::errc::read_only_file_system, ec); + EXPECT_EQ(0, ver); }; librados::async_operate(service, snapio, "exist", &op, 0, nullptr, failure_cb); } @@ -312,14 +333,14 @@ TEST_F(AsioRados, AsyncWriteOperationFuture) bufferlist bl; bl.append("hello"); - std::future<void> f1; + std::future<version_t> f1; { librados::ObjectWriteOperation op; op.write_full(bl); f1 = librados::async_operate(service, io, "exist", &op, 0, nullptr, boost::asio::use_future); } - std::future<void> f2; + std::future<version_t> f2; { librados::ObjectWriteOperation op; op.write_full(bl); @@ -328,7 +349,7 @@ TEST_F(AsioRados, AsyncWriteOperationFuture) } service.run(); - EXPECT_NO_THROW(f1.get()); + EXPECT_LT(0, f1.get()); EXPECT_THROW(f2.get(), boost::system::system_error); } @@ -342,18 +363,22 @@ TEST_F(AsioRados, AsyncWriteOperationYield) auto success_cr = [&] (boost::asio::yield_context yield) { librados::ObjectWriteOperation op; op.write_full(bl); - boost::system::error_code ec; - librados::async_operate(service, io, "exist", &op, 0, nullptr, yield[ec]); + error_code ec; + auto ver = librados::async_operate(service, io, "exist", &op, + 0, nullptr, yield[ec]); 
EXPECT_FALSE(ec); + EXPECT_LT(0, ver); }; boost::asio::spawn(service, success_cr, rethrow); auto failure_cr = [&] (boost::asio::yield_context yield) { librados::ObjectWriteOperation op; op.write_full(bl); - boost::system::error_code ec; - librados::async_operate(service, snapio, "exist", &op, 0, nullptr, yield[ec]); + error_code ec; + auto ver = librados::async_operate(service, snapio, "exist", &op, + 0, nullptr, yield[ec]); EXPECT_EQ(boost::system::errc::read_only_file_system, ec); + EXPECT_EQ(0, ver); }; boost::asio::spawn(service, failure_cr, rethrow); diff --git a/src/test/osd/CMakeLists.txt b/src/test/osd/CMakeLists.txt index 31e82944bf5..f2d1471e22e 100644 --- a/src/test/osd/CMakeLists.txt +++ b/src/test/osd/CMakeLists.txt @@ -19,6 +19,14 @@ install(TARGETS ceph_test_rados DESTINATION ${CMAKE_INSTALL_BINDIR}) +add_executable(ceph_test_rados_io_sequence + ${CMAKE_CURRENT_SOURCE_DIR}/ceph_test_rados_io_sequence.cc) +target_link_libraries(ceph_test_rados_io_sequence + librados global object_io_exerciser) +install(TARGETS + ceph_test_rados_io_sequence + DESTINATION ${CMAKE_INSTALL_BINDIR}) + # test_stale_read add_executable(ceph_test_osd_stale_read ceph_test_osd_stale_read.cc diff --git a/src/test/osd/TestECBackend.cc b/src/test/osd/TestECBackend.cc index d28d428fc06..f93ed7ff67a 100644 --- a/src/test/osd/TestECBackend.cc +++ b/src/test/osd/TestECBackend.cc @@ -230,3 +230,28 @@ TEST(ECCommon, get_min_want_to_read_shards) ASSERT_TRUE(want_to_read == (std::set<int>{0, 1, 2, 3})); } } + +TEST(ECCommon, get_min_want_to_read_shards_bug67087) +{ + const uint64_t swidth = 4096; + const uint64_t ssize = 4; + + ECUtil::stripe_info_t s(ssize, swidth); + ASSERT_EQ(s.get_stripe_width(), swidth); + ASSERT_EQ(s.get_chunk_size(), 1024); + + const std::vector<int> chunk_mapping = {}; // no remapping + + std::set<int> want_to_read; + + // multitple calls with the same want_to_read can happen during + // multi-region reads. 
+ { + ECCommon::ReadPipeline::get_min_want_to_read_shards( + 512, 512, s, chunk_mapping, &want_to_read); + ASSERT_EQ(want_to_read, std::set<int>{0}); + ECCommon::ReadPipeline::get_min_want_to_read_shards( + 512+16*1024, 512, s, chunk_mapping, &want_to_read); + ASSERT_EQ(want_to_read, std::set<int>{0}); + } +} diff --git a/src/test/osd/ceph_test_rados_io_sequence.cc b/src/test/osd/ceph_test_rados_io_sequence.cc new file mode 100644 index 00000000000..5e340c5c9c5 --- /dev/null +++ b/src/test/osd/ceph_test_rados_io_sequence.cc @@ -0,0 +1,822 @@ +#include "ceph_test_rados_io_sequence.h" + +#include <iostream> +#include <vector> + +#include <boost/asio/io_context.hpp> + +#include "include/random.h" + +#include "librados/librados_asio.h" +#include "common/ceph_argparse.h" +#include "include/interval_set.h" +#include "global/global_init.h" +#include "global/global_context.h" +#include "common/Thread.h" +#include "common/debug.h" +#include "common/dout.h" +#include "common/split.h" + +#include "common/io_exerciser/DataGenerator.h" +#include "common/io_exerciser/Model.h" +#include "common/io_exerciser/ObjectModel.h" +#include "common/io_exerciser/RadosIo.h" +#include "common/io_exerciser/IoOp.h" +#include "common/io_exerciser/IoSequence.h" + +#define dout_subsys ceph_subsys_rados +#define dout_context g_ceph_context + +namespace { + struct Size {}; + void validate(boost::any& v, const std::vector<std::string>& values, + Size *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + + std::string parse_error; + uint64_t size = strict_iecstrtoll(s, &parse_error); + if (!parse_error.empty()) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + v = boost::any(size); + } + + struct Pair {}; + void validate(boost::any& v, const std::vector<std::string>& values, + Pair *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + auto part = ceph::split(s).begin(); + std::string parse_error; + int first = strict_iecstrtoll(*part++, &parse_error); + int second = strict_iecstrtoll(*part, &parse_error); + if (!parse_error.empty()) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + v = boost::any(std::pair<int,int>{first,second}); + } + + struct PluginString {}; + void validate(boost::any& v, const std::vector<std::string>& values, + PluginString *target_type, int) { + po::validators::check_first_occurrence(v); + const std::string &s = po::validators::get_single_string(values); + + const std::string_view* pluginIt = std::find( + ceph::io_sequence::tester::pluginChoices.begin(), + ceph::io_sequence::tester::pluginChoices.end(), + s + ); + if(ceph::io_sequence::tester::pluginChoices.end() == pluginIt) + { + throw po::validation_error(po::validation_error::invalid_option_value); + } + + v = boost::any(*pluginIt); + } + + constexpr std::string_view usage[] = { + "Basic usage:", + "", + "ceph_test_rados_io_sequence", + "\t Test I/O to a single object using default settings. Good for", + "\t testing boundary conditions", + "", + "ceph_test_rados_io_sequence --parallel <n>", + "\t Run parallel test to multiple objects. 
First object is tested with", + "\t default settings, other objects are tested with random settings", + "", + "Advanced usage:", + "", + "ceph_test_rados_io_sequence --blocksize <b> --km <k,m> --plugin <p>", + " --objectsize <min,max> --threads <t>", + "ceph_test_rados_io_sequence --blocksize <b> --pool <p> --object <oid>", + " --objectsize <min,max> --threads <t>", + "\tCustomize the test, if a pool is specified then it defines the", + "\tReplica/EC configuration", + "", + "ceph_test_rados_io_sequence --listsequence", + "\t Display list of supported I/O sequences", + "", + "ceph_test_rados_io_sequence --dryrun --sequence <n>", + "\t Show I/O that will be generated for a sequence, validate", + "\t sequence has correct I/O barriers to restrict concurrency", + "", + "ceph_test_rados_io_sequence --seed <seed>", + "\t Repeat a previous test with the same random numbers (seed is", + "\t displayed at start of test), if threads = 1 then this will produce", + "\t the exact same sequence of I/O, if threads > 1 then I/Os are issued", + "\t in parallel so ordering might be slightly different", + "", + "ceph_test_rados_io_sequence --sequence <n> --seqseed <n>", + "\t Repeat a sequence from a previous test with the same random", + "\t numbers (seqseed is displayed at start of sequence)", + "", + "ceph_test_rados_io_sequence --pool <p> --object <oid> --interactive", + "\t Execute sequence of I/O commands from stdin. Offset and length", + "\t are specified with unit of blocksize. Supported commands:", + "\t\t create <len>", + "\t\t remove", + "\t\t read|write <off> <len>", + "\t\t read2|write2 <off> <len> <off> <len>", + "\t\t read3|write3 <off> <len> <off> <len> <off> <len>", + "\t\t done" + }; + + po::options_description get_options_description() + { + po::options_description desc("ceph_test_rados_io options"); + desc.add_options() + ("help,h", + "show help message") + ("listsequence,l", + "show list of sequences") + ("dryrun,d", + "test sequence, do not issue any I/O") + ("verbose", + "more verbose output during test") + ("sequence,s", po::value<int>(), + "test specified sequence") + ("seed", po::value<int>(), + "seed for whole test") + ("seqseed", po::value<int>(), + "seed for sequence") + ("blocksize,b", po::value<Size>(), + "block size (default 2048)") + ("chunksize,c", po::value<Size>(), + "chunk size (default 4096)") + ("pool,p", po::value<std::string>(), + "pool name") + ("object,o", po::value<std::string>()->default_value("test"), + "object name") + ("km", po::value<Pair>(), + "k,m EC pool profile (default 2,2)") + ("plugin", po::value<PluginString>(), + "EC plugin (isa or jerasure)") + ("objectsize", po::value<Pair>(), + "min,max object size in blocks (default 1,32)") + ("threads,t", po::value<int>(), + "number of threads of I/O per object (default 1)") + ("parallel,p", po::value<int>()->default_value(1), + "number of objects to exercise in parallel") + ("interactive", + "interactive mode, execute IO commands from stdin"); + + return desc; + } + + int parse_io_seq_options( + po::variables_map& vm, + int argc, + char** argv) + { + std::vector<std::string> unrecognized_options; + try { + po::options_description desc = get_options_description(); + + auto parsed = po::command_line_parser(argc, argv) + .options(desc) + .allow_unregistered() + .run(); + po::store(parsed, vm); + po::notify(vm); + unrecognized_options = po::collect_unrecognized(parsed.options, + po::include_positional); + + if (!unrecognized_options.empty()) + { + std::stringstream ss; + ss << "Unrecognised command options supplied: 
"; + while (unrecognized_options.size() > 1) + { + ss << unrecognized_options.back().c_str() << ", "; + unrecognized_options.pop_back(); + } + ss << unrecognized_options.back(); + dout(0) << ss.str() << dendl; + return 1; + } + } catch(const po::error& e) { + std::cerr << "error: " << e.what() << std::endl; + return 1; + } + + return 0; + } +} + +template <typename T, int N, const std::array<T, N>& Ts> +ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts> + ::ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, + po::variables_map vm, + const std::string& option_name, + bool set_forced, + bool select_first) + : rng(rng), + option_name(option_name) { + if (set_forced && vm.count(option_name)) { + force_value = vm[option_name].as<T>(); + } + if (select_first) { + ceph_assert(choices.size() > 0); + first_value = choices[0]; + } +} + +template <typename T, int N, const std::array<T, N>& Ts> +bool ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::isForced() +{ + return force_value.has_value(); +} + +template <typename T, int N, const std::array<T, N>& Ts> +const T ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::choose() +{ + if (force_value.has_value()) { + return *force_value; + } else if (first_value.has_value()) { + return *std::exchange(first_value, std::nullopt); + } else { + return choices[rng(N-1)]; + } +} + + + +ceph::io_sequence::tester::SelectObjectSize::SelectObjectSize( + ceph::util::random_number_generator<int>& rng, + po::variables_map vm) + : ProgramOptionSelector(rng, vm, "objectsize", true, true) +{ +} + + + +ceph::io_sequence::tester::SelectBlockSize::SelectBlockSize( + ceph::util::random_number_generator<int>& rng, + po::variables_map vm) + : ProgramOptionSelector(rng, vm, "blocksize", true, true) +{ +} + + + +ceph::io_sequence::tester::SelectNumThreads::SelectNumThreads( + ceph::util::random_number_generator<int>& rng, + po::variables_map vm) + : ProgramOptionSelector(rng, vm, "threads", true, true) +{ +} + + + +ceph::io_sequence::tester::SelectSeqRange::SelectSeqRange( + ceph::util::random_number_generator<int>& rng, + po::variables_map vm) + : ProgramOptionSelector(rng, vm, "sequence", false, false) +{ + if (vm.count(option_name)) { + ceph::io_exerciser::Sequence s = + static_cast<ceph::io_exerciser::Sequence>(vm["sequence"].as<int>()); + if (s < ceph::io_exerciser::Sequence::SEQUENCE_BEGIN || + s >= ceph::io_exerciser::Sequence::SEQUENCE_END) { + dout(0) << "Sequence argument out of range" << dendl; + throw po::validation_error(po::validation_error::invalid_option_value); + } + ceph::io_exerciser::Sequence e = s; + force_value = std::make_optional<std::pair<ceph::io_exerciser::Sequence, + ceph::io_exerciser::Sequence>>( + std::make_pair(s, ++e)); + } +} + +const std::pair<ceph::io_exerciser::Sequence,ceph::io_exerciser::Sequence> + ceph::io_sequence::tester::SelectSeqRange::choose() { + if (force_value.has_value()) + { + return *force_value; + } else { + return std::make_pair(ceph::io_exerciser::Sequence::SEQUENCE_BEGIN, + ceph::io_exerciser::Sequence::SEQUENCE_END); + } +} + + + +ceph::io_sequence::tester::SelectErasureKM::SelectErasureKM( + ceph::util::random_number_generator<int>& rng, + po::variables_map vm) + : ProgramOptionSelector(rng, vm, "km", true, true) +{ +} + + + +ceph::io_sequence::tester::SelectErasurePlugin::SelectErasurePlugin( + ceph::util::random_number_generator<int>& rng, + po::variables_map vm) + : ProgramOptionSelector(rng, vm, "plugin", true, false) +{ +} + + + 
+ceph::io_sequence::tester::SelectErasureChunkSize::SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "stripe_unit", true, false) +{ +} + + + +ceph::io_sequence::tester::SelectECPool::SelectECPool( + ceph::util::random_number_generator<int>& rng, + po::variables_map vm, + librados::Rados& rados, + bool dry_run) + : ProgramOptionSelector(rng, vm, "pool", false, false), + rados(rados), + dry_run(dry_run), + skm(SelectErasureKM(rng, vm)), + spl(SelectErasurePlugin(rng, vm)), + scs(SelectErasureChunkSize(rng, vm)) +{ + if (!skm.isForced()) { + if (vm.count("pool")) { + force_value = vm["pool"].as<std::string>(); + } + } +} + +const std::string ceph::io_sequence::tester::SelectECPool::choose() +{ + std::pair<int,int> value; + if (!skm.isForced() && force_value.has_value()) { + return *force_value; + } else { + value = skm.choose(); + } + int k = value.first; + int m = value.second; + + const std::string plugin = std::string(spl.choose()); + const uint64_t chunk_size = scs.choose(); + + std::string pool_name = "ec_" + plugin + + "_cs" + std::to_string(chunk_size) + + "_k" + std::to_string(k) + + "_m" + std::to_string(m); + if (!dry_run) + { + create_pool(rados, pool_name, plugin, chunk_size, k, m); + } + return pool_name; +} + +void ceph::io_sequence::tester::SelectECPool::create_pool( + librados::Rados& rados, + const std::string& pool_name, + const std::string& plugin, + uint64_t chunk_size, + int k, int m) +{ + int rc; + bufferlist inbl, outbl; + std::string profile_create = + "{\"prefix\": \"osd erasure-code-profile set\", \ + \"name\": \"testprofile-" + pool_name + "\", \ + \"profile\": [ \"plugin=" + plugin + "\", \ + \"k=" + std::to_string(k) + "\", \ + \"m=" + std::to_string(m) + "\", \ + \"stripe_unit=" + std::to_string(chunk_size) + "\", \ + \"crush-failure-domain=osd\"]}"; + rc = rados.mon_command(profile_create, inbl, &outbl, nullptr); + ceph_assert(rc == 0); + std::string cmdstr = + "{\"prefix\": \"osd pool create\", \ + \"pool\": \"" + pool_name + "\", \ + \"pool_type\": \"erasure\", \ + \"pg_num\": 8, \ + \"pgp_num\": 8, \ + \"erasure_code_profile\": \"testprofile-" + pool_name + "\"}"; + rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr); + ceph_assert(rc == 0); +} + + + +ceph::io_sequence::tester::TestObject::TestObject( const std::string oid, + librados::Rados& rados, + boost::asio::io_context& asio, + SelectBlockSize& sbs, + SelectECPool& spo, + SelectObjectSize& sos, + SelectNumThreads& snt, + SelectSeqRange& ssr, + ceph::util::random_number_generator<int>& rng, + ceph::mutex& lock, + ceph::condition_variable& cond, + bool dryrun, + bool verbose, + std::optional<int> seqseed) : + rng(rng), verbose(verbose), seqseed(seqseed) +{ + if (dryrun) { + verbose = true; + exerciser_model = std::make_unique<ceph::io_exerciser::ObjectModel>(oid, + sbs.choose(), + rng()); + } else { + const std::string pool = spo.choose(); + int threads = snt.choose(); + exerciser_model = std::make_unique<ceph::io_exerciser::RadosIo>(rados, + asio, + pool, + oid, + sbs.choose(), + rng(), + threads, + lock, + cond); + dout(0) << "= " << oid << " pool=" << pool + << " threads=" << threads + << " blocksize=" << exerciser_model->get_block_size() + << " =" << dendl; + } + obj_size_range = sos.choose(); + seq_range = ssr.choose(); + curseq = seq_range.first; + seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq, + obj_size_range, + seqseed.value_or(rng())); + op = seq->next(); + done = false; + dout(0) << "== " << 
exerciser_model->get_oid() << " " + << curseq << " " + << seq->get_name() + << " ==" <<dendl; +} + +bool ceph::io_sequence::tester::TestObject::readyForIo() +{ + return exerciser_model->readyForIoOp(*op); +} + +bool ceph::io_sequence::tester::TestObject::next() +{ + if (!done) { + if (verbose) { + dout(0) << exerciser_model->get_oid() + << " Step " << seq->get_step() << ": " + << op->to_string(exerciser_model->get_block_size()) << dendl; + } else { + dout(5) << exerciser_model->get_oid() + << " Step " << seq->get_step() << ": " + << op->to_string(exerciser_model->get_block_size()) << dendl; + } + exerciser_model->applyIoOp(*op); + if (op->done()) { + ++curseq; + if (curseq == seq_range.second) { + done = true; + dout(0) << exerciser_model->get_oid() + << " Number of IOs = " << exerciser_model->get_num_io() + << dendl; + } else { + seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq, + obj_size_range, + seqseed.value_or(rng())); + dout(0) << "== " << exerciser_model->get_oid() << " " + << curseq << " " << seq->get_name() + << " ==" <<dendl; + op = seq->next(); + } + } else { + op = seq->next(); + } + } + return done; +} + +bool ceph::io_sequence::tester::TestObject::finished() +{ + return done; +} + +int ceph::io_sequence::tester::TestObject::get_num_io() +{ + return exerciser_model->get_num_io(); +} + +ceph::io_sequence::tester::TestRunner::TestRunner(po::variables_map& vm, + librados::Rados& rados) : + rados(rados), + seed(vm.contains("seed") ? vm["seed"].as<int>() : time(nullptr)), + rng(ceph::util::random_number_generator<int>(seed)), + sbs{rng, vm}, + sos{rng, vm}, + spo{rng, vm, rados, vm.contains("dryrun")}, + snt{rng, vm}, + ssr{rng, vm} +{ + dout(0) << "Test using seed " << seed << dendl; + + verbose = vm.contains("verbose"); + dryrun = vm.contains("dryrun"); + + seqseed = std::nullopt; + if (vm.contains("seqseed")) { + seqseed = vm["seqseed"].as<int>(); + } + num_objects = vm["parallel"].as<int>(); + object_name = vm["object"].as<std::string>(); + interactive = vm.contains("interactive"); + + if (!dryrun) + { + guard.emplace(boost::asio::make_work_guard(asio)); + thread = make_named_thread("io_thread",[&asio = asio] { asio.run(); }); + } + + show_help = vm.contains("help"); + show_sequence = vm.contains("listsequence"); +} + +ceph::io_sequence::tester::TestRunner::~TestRunner() +{ + if (!dryrun) { + guard = std::nullopt; + asio.stop(); + thread.join(); + rados.shutdown(); + } +} + +void ceph::io_sequence::tester::TestRunner::help() +{ + std::cout << get_options_description() << std::endl; + for (auto line : usage) { + std::cout << line << std::endl; + } +} + +void ceph::io_sequence::tester::TestRunner::list_sequence() +{ + // List sequences + std::pair<int,int> obj_size_range = sos.choose(); + for (ceph::io_exerciser::Sequence s + = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN; + s < ceph::io_exerciser::Sequence::SEQUENCE_END; ++s) { + std::unique_ptr<ceph::io_exerciser::IoSequence> seq = + ceph::io_exerciser::IoSequence::generate_sequence(s, + obj_size_range, + seqseed.value_or(rng())); + dout(0) << s << " " << seq->get_name() << dendl; + } +} + +std::string ceph::io_sequence::tester::TestRunner::get_token() +{ + static std::string line; + static ceph::split split = ceph::split(""); + static ceph::spliterator tokens; + while (line.empty() || tokens == split.end()) { + if (!std::getline(std::cin, line)) { + throw std::runtime_error("End of input"); + } + split = ceph::split(line); + tokens = split.begin(); + } + return std::string(*tokens++); +} + +uint64_t 
ceph::io_sequence::tester::TestRunner::get_numeric_token() +{ + std::string parse_error; + std::string token = get_token(); + uint64_t num = strict_iecstrtoll(token, &parse_error); + if (!parse_error.empty()) { + throw std::runtime_error("Invalid number "+token); + } + return num; +} + +bool ceph::io_sequence::tester::TestRunner::run_test() +{ + if (show_help) + { + help(); + return true; + } + else if (show_sequence) + { + list_sequence(); + return true; + } + else if (interactive) + { + return run_interactive_test(); + } + else + { + return run_automated_test(); + } +} + +bool ceph::io_sequence::tester::TestRunner::run_interactive_test() +{ + bool done = false; + std::unique_ptr<ceph::io_exerciser::IoOp> ioop; + std::unique_ptr<ceph::io_exerciser::Model> model; + + if (dryrun) { + model = std::make_unique<ceph::io_exerciser::ObjectModel>(object_name, + sbs.choose(), + rng()); + } else { + const std::string pool = spo.choose(); + model = std::make_unique<ceph::io_exerciser::RadosIo>(rados, asio, pool, + object_name, sbs.choose(), + rng(), 1, // 1 thread + lock, cond); + } + + while (!done) { + const std::string op = get_token(); + if (!op.compare("done") || !op.compare("q") || !op.compare("quit")) { + ioop = ceph::io_exerciser::IoOp::generate_done(); + } else if (!op.compare("create")) { + ioop = ceph::io_exerciser::IoOp::generate_create(get_numeric_token()); + } else if (!op.compare("remove") || !op.compare("delete")) { + ioop = ceph::io_exerciser::IoOp::generate_remove(); + } else if (!op.compare("read")) { + uint64_t offset = get_numeric_token(); + uint64_t length = get_numeric_token(); + ioop = ceph::io_exerciser::IoOp::generate_read(offset, length); + } else if (!op.compare("read2")) { + uint64_t offset1 = get_numeric_token(); + uint64_t length1 = get_numeric_token(); + uint64_t offset2 = get_numeric_token(); + uint64_t length2 = get_numeric_token(); + ioop = ceph::io_exerciser::IoOp::generate_read2(offset1, length1, + offset2, length2); + } else if (!op.compare("read3")) { + uint64_t offset1 = get_numeric_token(); + uint64_t length1 = get_numeric_token(); + uint64_t offset2 = get_numeric_token(); + uint64_t length2 = get_numeric_token(); + uint64_t offset3 = get_numeric_token(); + uint64_t length3 = get_numeric_token(); + ioop = ceph::io_exerciser::IoOp::generate_read3(offset1, length1, + offset2, length2, + offset3, length3); + } else if (!op.compare("write")) { + uint64_t offset = get_numeric_token(); + uint64_t length = get_numeric_token(); + ioop = ceph::io_exerciser::IoOp::generate_write(offset, length); + } else if (!op.compare("write2")) { + uint64_t offset1 = get_numeric_token(); + uint64_t length1 = get_numeric_token(); + uint64_t offset2 = get_numeric_token(); + uint64_t length2 = get_numeric_token(); + ioop = ceph::io_exerciser::IoOp::generate_write2(offset1, length1, + offset2, length2); + } else if (!op.compare("write3")) { + uint64_t offset1 = get_numeric_token(); + uint64_t length1 = get_numeric_token(); + uint64_t offset2 = get_numeric_token(); + uint64_t length2 = get_numeric_token(); + uint64_t offset3 = get_numeric_token(); + uint64_t length3 = get_numeric_token(); + ioop = ceph::io_exerciser::IoOp::generate_write3(offset1, length1, + offset2, length2, + offset3, length3); + } else { + throw std::runtime_error("Invalid operation "+op); + } + dout(0) << ioop->to_string(model->get_block_size()) << dendl; + model->applyIoOp(*ioop); + done = ioop->done(); + if (!done) { + ioop = ceph::io_exerciser::IoOp::generate_barrier(); + model->applyIoOp(*ioop); + } + } + + 
return true; +} + +bool ceph::io_sequence::tester::TestRunner::run_automated_test() +{ + // Create a test for each object + std::vector<std::shared_ptr< + ceph::io_sequence::tester::TestObject>> test_objects; + + for (int obj = 0; obj < num_objects; obj++) { + std::string name; + if (obj == 0) { + name = object_name; + } else { + name = object_name + std::to_string(obj); + } + test_objects.push_back( + std::make_shared<ceph::io_sequence::tester::TestObject>( + name, + rados, asio, + sbs, spo, sos, snt, ssr, + rng, lock, cond, + dryrun, verbose, + seqseed + ) + ); + } + if (!dryrun) { + rados.wait_for_latest_osdmap(); + } + + // Main loop of test - while not all test objects have finished + // check to see if any are able to start a new I/O. If all test + // objects are waiting for I/O to complete then wait on a cond + // that is signalled each time an I/O completes + + bool started_io = true; + bool need_wait = true; + while (started_io || need_wait) { + started_io = false; + need_wait = false; + for (auto obj = test_objects.begin(); obj != test_objects.end(); ++obj) { + std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj; + if (!to->finished()) { + lock.lock(); + bool ready = to->readyForIo(); + lock.unlock(); + if (ready) + { + to->next(); + started_io = true; + } else { + need_wait = true; + } + } + } + if (!started_io && need_wait) { + std::unique_lock l(lock); + // Recheck with lock incase anything has changed + for (auto obj = test_objects.begin(); obj != test_objects.end(); ++obj) { + std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj; + if (!to->finished()) { + need_wait = !to->readyForIo(); + if (!need_wait) + { + break; + } + } + } + need_wait = true; + } + } + + int total_io = 0; + for (auto obj = test_objects.begin(); obj != test_objects.end(); ++obj) { + std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj; + total_io += to->get_num_io(); + ceph_assert(to->finished()); + } + dout(0) << "Total number of IOs = " << total_io << dendl; + + return true; +} + +int main(int argc, char **argv) +{ + auto args = argv_to_vec(argc, argv); + env_to_vec(args); + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(cct.get()); + + po::variables_map vm; + int rc = parse_io_seq_options(vm, argc, argv); + if (rc != 0) + { + return rc; + } + + librados::Rados rados; + if (!vm.contains("dryrun")) { + rc = rados.init_with_context(g_ceph_context); + ceph_assert(rc == 0); + rc = rados.connect(); + ceph_assert(rc == 0); + } + + std::unique_ptr<ceph::io_sequence::tester::TestRunner> runner; + try { + runner = std::make_unique<ceph::io_sequence::tester::TestRunner>(vm, rados); + } catch(const po::error& e) { + return 1; + } + runner->run_test(); + + return 0; +} diff --git a/src/test/osd/ceph_test_rados_io_sequence.h b/src/test/osd/ceph_test_rados_io_sequence.h new file mode 100644 index 00000000000..4f77c940274 --- /dev/null +++ b/src/test/osd/ceph_test_rados_io_sequence.h @@ -0,0 +1,343 @@ +#include <utility> + +#include "include/random.h" + +#include "global/global_init.h" +#include "global/global_context.h" + +#include "common/io_exerciser/IoOp.h" +#include "common/io_exerciser/IoSequence.h" +#include "common/io_exerciser/Model.h" + +#include "librados/librados_asio.h" + +#include <boost/program_options.hpp> + +/* Overview + * + * class ProgramOptionSelector + * Base class for selector objects below with common code for + * selecting options + * + * class SelectObjectSize + * Selects min and max 
object sizes for a test + * + * class SelectErasureKM + * Selects an EC k and m value for a test + * + * class SelectErasurePlugin + * Selects an plugin for a test + * + * class SelectECPool + * Selects an EC pool (plugin,k and m) for a test. Also creates the + * pool as well. + * + * class SelectBlockSize + * Selects a block size for a test + * + * class SelectNumThreads + * Selects number of threads for a test + * + * class SelectSeqRange + * Selects a sequence or range of sequences for a test + * + * class TestObject + * Runs a test against an object, generating IOSequence + * and applying them to an IoExerciser + * + * main + * Run sequences of I/O with data integrity checking to + * one or more objects in parallel. Without arguments + * runs a default configuration against one object. + * Command arguments can select alternative + * configurations. Alternatively running against + * multiple objects with --objects <n> will select a + * random configuration for all but the first object. + */ + +namespace po = boost::program_options; + +namespace ceph +{ + namespace io_sequence::tester + { + // Choices for min and max object size + inline constexpr size_t objectSizeSize = 10; + inline constexpr std::array<std::pair<int,int>,objectSizeSize> + objectSizeChoices = {{ + {1,32}, // Default - best for boundary checking + {12,14}, + {28,30}, + {36,38}, + {42,44}, + {52,54}, + {66,68}, + {72,74}, + {83,83}, + {97,97} + }}; + + // Choices for block size + inline constexpr int blockSizeSize = 5; + inline constexpr std::array<uint64_t, blockSizeSize> blockSizeChoices = {{ + 2048, // Default - test boundaries for EC 4K chunk size + 512, + 3767, + 4096, + 32768 + }}; + + // Choices for number of threads + inline constexpr int threadArraySize = 4; + inline constexpr std::array<int, threadArraySize> threadCountChoices = {{ + 1, // Default + 2, + 4, + 8 + }}; + + // Choices for EC k+m profile + inline constexpr int kmSize = 6; + inline constexpr std::array<std::pair<int,int>, kmSize> kmChoices = {{ + {2,2}, // Default - reasonable coverage + {2,1}, + {2,3}, + {3,2}, + {4,2}, + {5,1} + }}; + + // Choices for EC chunk size + inline constexpr int chunkSizeSize = 3; + inline constexpr std::array<uint64_t, chunkSizeSize> chunkSizeChoices = {{ + 4*1024, + 64*1024, + 256*1024 + }}; + + // Choices for plugin + inline constexpr int pluginListSize = 2; + inline constexpr std::array<std::string_view, + pluginListSize> pluginChoices = {{ + "jerasure", + "isa" + }}; + + inline constexpr std::array<std::pair<ceph::io_exerciser::Sequence, + ceph::io_exerciser::Sequence>, + 0> sequencePairs = {{}}; + + inline constexpr std::array<std::string, 0> poolChoices = {{}}; + + template <typename T, int N, const std::array<T, N>& Ts> + class ProgramOptionSelector + { + public: + ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, + po::variables_map vm, + const std::string& option_name, + bool set_forced, + bool select_first + ); + virtual ~ProgramOptionSelector() = default; + bool isForced(); + virtual const T choose(); + + protected: + ceph::util::random_number_generator<int>& rng; + static constexpr std::array<T, N> choices = Ts; + + std::optional<T> force_value; + std::optional<T> first_value; + + std::string option_name; + }; + + class SelectObjectSize + : public ProgramOptionSelector<std::pair<int, int>, + io_sequence::tester::objectSizeSize, + io_sequence::tester::objectSizeChoices> + { + public: + SelectObjectSize(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); + }; + + class 
SelectBlockSize + : public ProgramOptionSelector<uint64_t, + io_sequence::tester::blockSizeSize, + io_sequence::tester::blockSizeChoices> + { + public: + SelectBlockSize(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); + }; + + class SelectNumThreads + : public ProgramOptionSelector<int, + io_sequence::tester::threadArraySize, + io_sequence::tester::threadCountChoices> + { + public: + SelectNumThreads(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); + }; + + class SelectSeqRange + : public ProgramOptionSelector<std::pair<ceph::io_exerciser::Sequence, + ceph::io_exerciser::Sequence>, + 0, io_sequence::tester::sequencePairs> + { + public: + SelectSeqRange(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); + + const std::pair<ceph::io_exerciser::Sequence, + ceph::io_exerciser::Sequence> choose() override; + }; + + class SelectErasureKM + : public ProgramOptionSelector<std::pair<int,int>, + io_sequence::tester::kmSize, + io_sequence::tester::kmChoices> + { + public: + SelectErasureKM(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); + }; + + class SelectErasurePlugin + : public ProgramOptionSelector<std::string_view, + io_sequence::tester::pluginListSize, + io_sequence::tester::pluginChoices> + { + public: + SelectErasurePlugin(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); + }; + + class SelectErasureChunkSize + : public ProgramOptionSelector<uint64_t, + io_sequence::tester::chunkSizeSize, + io_sequence::tester::chunkSizeChoices> + { + public: + SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, po::variables_map vm); + }; + + class SelectECPool + : public ProgramOptionSelector<std::string, + 0, + io_sequence::tester::poolChoices> + { + public: + SelectECPool(ceph::util::random_number_generator<int>& rng, + po::variables_map vm, + librados::Rados& rados, + bool dry_run); + const std::string choose() override; + + private: + void create_pool(librados::Rados& rados, + const std::string& pool_name, + const std::string& plugin, + uint64_t chunk_size, + int k, int m); + + protected: + librados::Rados& rados; + bool dry_run; + + SelectErasureKM skm; + SelectErasurePlugin spl; + SelectErasureChunkSize scs; + }; + + class TestObject + { + public: + TestObject( const std::string oid, + librados::Rados& rados, + boost::asio::io_context& asio, + ceph::io_sequence::tester::SelectBlockSize& sbs, + ceph::io_sequence::tester::SelectECPool& spl, + ceph::io_sequence::tester::SelectObjectSize& sos, + ceph::io_sequence::tester::SelectNumThreads& snt, + ceph::io_sequence::tester::SelectSeqRange& ssr, + ceph::util::random_number_generator<int>& rng, + ceph::mutex& lock, + ceph::condition_variable& cond, + bool dryrun, + bool verbose, + std::optional<int> seqseed); + + int get_num_io(); + bool readyForIo(); + bool next(); + bool finished(); + + protected: + std::unique_ptr<ceph::io_exerciser::Model> exerciser_model; + std::pair<int,int> obj_size_range; + std::pair<ceph::io_exerciser::Sequence, + ceph::io_exerciser::Sequence> seq_range; + ceph::io_exerciser::Sequence curseq; + std::unique_ptr<ceph::io_exerciser::IoSequence> seq; + std::unique_ptr<ceph::io_exerciser::IoOp> op; + bool done; + ceph::util::random_number_generator<int>& rng; + bool verbose; + std::optional<int> seqseed; + }; + + class TestRunner + { + public: + TestRunner(po::variables_map& vm, librados::Rados& rados); + ~TestRunner(); + + bool run_test(); + + private: + librados::Rados& rados; + int seed; + 
ceph::util::random_number_generator<int> rng; + + ceph::io_sequence::tester::SelectBlockSize sbs; + ceph::io_sequence::tester::SelectObjectSize sos; + ceph::io_sequence::tester::SelectECPool spo; + ceph::io_sequence::tester::SelectNumThreads snt; + ceph::io_sequence::tester::SelectSeqRange ssr; + + boost::asio::io_context asio; + std::thread thread; + std::optional<boost::asio::executor_work_guard< + boost::asio::io_context::executor_type>> guard; + ceph::mutex lock = ceph::make_mutex("RadosIo::lock"); + ceph::condition_variable cond; + + bool input_valid; + + bool verbose; + bool dryrun; + std::optional<int> seqseed; + bool interactive; + + bool show_sequence; + bool show_help; + + int num_objects; + std::string object_name; + + std::string get_token(); + uint64_t get_numeric_token(); + + bool run_automated_test(); + + bool run_interactive_test(); + + void help(); + void list_sequence(); + }; + } +}
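The header above declares a family of ProgramOptionSelector subclasses that each own a constexpr array of candidate values and either honour a value forced on the command line or pick one at random per test. The following is a minimal standalone sketch of that selection pattern, not the Ceph implementation: std::mt19937 and std::optional stand in for ceph::util::random_number_generator and boost::program_options, and the real selectors' "select first choice as default" behaviour is omitted.

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <optional>
#include <random>

// Sketch of the selector pattern: return the forced value when one was
// supplied (e.g. a block size pinned on the command line), otherwise pick
// uniformly at random from the fixed list of choices.
template <typename T, std::size_t N>
class OptionSelector {
 public:
  OptionSelector(std::mt19937& rng, std::array<T, N> choices,
                 std::optional<T> forced = std::nullopt)
      : rng(rng), choices(choices), forced(forced) {}

  bool is_forced() const { return forced.has_value(); }

  T choose() {
    if (forced) {
      return *forced;           // caller pinned this option
    }
    std::uniform_int_distribution<std::size_t> pick(0, N - 1);
    return choices[pick(rng)];  // random choice for automated runs
  }

 private:
  std::mt19937& rng;
  std::array<T, N> choices;
  std::optional<T> forced;
};

int main() {
  std::mt19937 rng(12345);  // fixed seed so a failing run can be replayed
  OptionSelector<std::uint64_t, 5> block_size(
      rng, {2048, 512, 3767, 4096, 32768});
  for (int i = 0; i < 3; ++i) {
    std::cout << "block size: " << block_size.choose() << "\n";
  }
}
```

Seeding the generator once and reusing it for every selector is what lets a whole randomized configuration be reproduced from the single --seed value reported at the start of a run.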
\ No newline at end of file diff --git a/src/test/rgw/bucket_notification/test_bn.py b/src/test/rgw/bucket_notification/test_bn.py index 642ab6955a4..359990b3531 100644 --- a/src/test/rgw/bucket_notification/test_bn.py +++ b/src/test/rgw/bucket_notification/test_bn.py @@ -711,19 +711,16 @@ def test_ps_s3_topic_on_master(): assert_equal(status, 404) # get the remaining 2 topics - result, status = topic_conf1.get_list() - assert_equal(status, 200) - assert_equal(len(result['ListTopicsResponse']['ListTopicsResult']['Topics']['member']), 2) + list_topics(2, tenant) # delete topics - result = topic_conf2.del_config() + status = topic_conf2.del_config() assert_equal(status, 200) - result = topic_conf3.del_config() + status = topic_conf3.del_config() assert_equal(status, 200) # get topic list, make sure it is empty - result, status = topic_conf1.get_list() - assert_equal(result['ListTopicsResponse']['ListTopicsResult']['Topics'], None) + list_topics(0, tenant) @attr('basic_test') diff --git a/src/test/rgw/test_rgw_lua.cc b/src/test/rgw/test_rgw_lua.cc index b2e11e442a2..ad923023a6d 100644 --- a/src/test/rgw/test_rgw_lua.cc +++ b/src/test/rgw/test_rgw_lua.cc @@ -9,6 +9,7 @@ #include "rgw_lua_background.h" #include "rgw_lua_data_filter.h" #include "rgw_sal_config.h" +#include "rgw_perf_counters.h" using namespace std; using namespace rgw; @@ -184,9 +185,51 @@ inline std::unique_ptr<sal::RadosStore> make_store() { return std::make_unique<StoreBundle>(std::move(context_pool)); }; +class TestLuaManager : public rgw::sal::StoreLuaManager { + public: + std::string lua_script; + unsigned read_time = 0; + TestLuaManager() { + rgw_perf_start(g_cct); + } + int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override { + std::this_thread::sleep_for(std::chrono::seconds(read_time)); + script = lua_script; + return 0; + } + int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override { + return 0; + } + int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override { + return 0; + } + int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override { + return 0; + } + int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override { + return 0; + } + int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override { + return 0; + } + int reload_packages(const DoutPrefixProvider* dpp, optional_yield y) override { + return 0; + } + ~TestLuaManager() { + rgw_perf_stop(g_cct); + } +}; + +void set_script(rgw::sal::LuaManager* manager, const std::string& script) { + static_cast<TestLuaManager*>(manager)->lua_script = script; +} +void set_read_time(rgw::sal::LuaManager* manager, unsigned read_time) { + static_cast<TestLuaManager*>(manager)->read_time = read_time; +} + #define DEFINE_REQ_STATE RGWProcessEnv pe; \ auto store = make_store(); \ - pe.lua.manager = store->get_lua_manager(""); \ + pe.lua.manager = std::make_unique<TestLuaManager>(); \ RGWEnv e; \ req_state s(g_cct, pe, &e, 0); @@ -850,24 +893,12 @@ TEST(TestRGWLua, OpsLog) } class TestBackground : public rgw::lua::Background { - const unsigned read_time; - -protected: - int read_script() override { - // don't read the object from the store - std::this_thread::sleep_for(std::chrono::seconds(read_time)); - return 0; - } - public: - TestBackground(sal::RadosStore* store, const 
std::string& script, rgw::sal::LuaManager* manager, unsigned read_time = 0) : + TestBackground(sal::RadosStore* store, rgw::sal::LuaManager* manager) : rgw::lua::Background(store, g_cct, manager, - 1 /* run every second */), - read_time(read_time) { - // the script is passed in the constructor - rgw_script = script; + 1 /* run every second */) { } ~TestBackground() override { @@ -878,20 +909,19 @@ public: TEST(TestRGWLuaBackground, Start) { auto store = make_store(); - auto manager = store->get_lua_manager(""); + auto manager = std::make_unique<TestLuaManager>(); { // ctr and dtor without running - TestBackground lua_background(store.get(), "", manager.get()); + TestBackground lua_background(store.get(), manager.get()); } { // ctr and dtor with running - TestBackground lua_background(store.get(), "", manager.get()); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); } } - -constexpr auto wait_time = std::chrono::seconds(3); +constexpr auto wait_time = std::chrono::milliseconds(100); template<typename T> const T& get_table_value(const TestBackground& b, const std::string& index) { @@ -903,6 +933,15 @@ const T& get_table_value(const TestBackground& b, const std::string& index) { } } +#define WAIT_FOR_BACKGROUND \ +{ \ + unsigned max_tries = 100; \ + do { \ + std::this_thread::sleep_for(wait_time); \ + --max_tries; \ + } while (perfcounter->get(l_rgw_lua_script_ok) + perfcounter->get(l_rgw_lua_script_fail) == 0 && max_tries > 0); \ +} + TEST(TestRGWLuaBackground, Script) { const std::string script = R"( @@ -912,10 +951,11 @@ TEST(TestRGWLuaBackground, Script) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique<TestLuaManager>(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "world"); } @@ -928,9 +968,10 @@ TEST(TestRGWLuaBackground, RequestScript) )"; DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), background_script, pe.lua.manager.get()); + set_script(pe.lua.manager.get(), background_script); + TestBackground lua_background(store.get(), pe.lua.manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const std::string request_script = R"( local key = "hello" @@ -947,8 +988,9 @@ TEST(TestRGWLuaBackground, RequestScript) ASSERT_EQ(rc, 0); EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "from request"); // now we resume and let the background set the value + perfcounter->set(l_rgw_lua_script_ok, 0); lua_background.resume(store.get()); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "from background"); } @@ -965,14 +1007,16 @@ TEST(TestRGWLuaBackground, Pause) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique<TestLuaManager>(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value<std::string>(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.pause(); - 
std::this_thread::sleep_for(wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); // no change in len EXPECT_EQ(value_len, get_table_value<std::string>(lua_background, "hello").size()); } @@ -991,15 +1035,17 @@ TEST(TestRGWLuaBackground, PauseWhileReading) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get(), 2); + auto manager = std::make_unique<TestLuaManager>(); + set_script(manager.get(), script); + set_read_time(manager.get(), 2); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - constexpr auto long_wait_time = std::chrono::seconds(6); - std::this_thread::sleep_for(long_wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value<std::string>(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.pause(); - std::this_thread::sleep_for(long_wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); // one execution might occur after pause EXPECT_TRUE(value_len + 1 >= get_table_value<std::string>(lua_background, "hello").size()); } @@ -1013,14 +1059,16 @@ TEST(TestRGWLuaBackground, ReadWhilePaused) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique<TestLuaManager>(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.pause(); lua_background.start(); - std::this_thread::sleep_for(wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), ""); lua_background.resume(store.get()); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; EXPECT_EQ(get_table_value<std::string>(lua_background, "hello"), "world"); } @@ -1037,18 +1085,21 @@ TEST(TestRGWLuaBackground, PauseResume) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique<TestLuaManager>(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value<std::string>(lua_background, "hello").size(); EXPECT_GT(value_len, 0); lua_background.pause(); - std::this_thread::sleep_for(wait_time); + // make sure no execution occurs + std::this_thread::sleep_for(wait_time*10); // no change in len EXPECT_EQ(value_len, get_table_value<std::string>(lua_background, "hello").size()); + perfcounter->set(l_rgw_lua_script_ok, 0); lua_background.resume(store.get()); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; // should be a change in len EXPECT_GT(get_table_value<std::string>(lua_background, "hello").size(), value_len); } @@ -1066,18 +1117,19 @@ TEST(TestRGWLuaBackground, MultipleStarts) )"; auto store = make_store(); - auto manager = store->get_lua_manager(""); - TestBackground lua_background(store.get(), script, manager.get()); + auto manager = std::make_unique<TestLuaManager>(); + set_script(manager.get(), script); + TestBackground lua_background(store.get(), manager.get()); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; const auto value_len = get_table_value<std::string>(lua_background, 
"hello").size(); EXPECT_GT(value_len, 0); lua_background.start(); lua_background.shutdown(); lua_background.shutdown(); - std::this_thread::sleep_for(wait_time); + perfcounter->set(l_rgw_lua_script_ok, 0); lua_background.start(); - std::this_thread::sleep_for(wait_time); + WAIT_FOR_BACKGROUND; // should be a change in len EXPECT_GT(get_table_value<std::string>(lua_background, "hello").size(), value_len); } @@ -1085,7 +1137,7 @@ TEST(TestRGWLuaBackground, MultipleStarts) TEST(TestRGWLuaBackground, TableValues) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = "string value" @@ -1107,7 +1159,7 @@ TEST(TestRGWLuaBackground, TableValues) TEST(TestRGWLuaBackground, TablePersist) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( RGW["key1"] = "string value" @@ -1137,7 +1189,7 @@ TEST(TestRGWLuaBackground, TablePersist) TEST(TestRGWLuaBackground, TableValuesFromRequest) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); lua_background.start(); const std::string request_script = R"( @@ -1165,7 +1217,7 @@ TEST(TestRGWLuaBackground, TableValuesFromRequest) TEST(TestRGWLuaBackground, TableInvalidValue) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); lua_background.start(); const std::string request_script = R"( @@ -1191,7 +1243,7 @@ TEST(TestRGWLuaBackground, TableInvalidValue) TEST(TestRGWLuaBackground, TableErase) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( RGW["size"] = 0 @@ -1229,7 +1281,7 @@ TEST(TestRGWLuaBackground, TableErase) TEST(TestRGWLuaBackground, TableIterate) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = "string value" @@ -1256,7 +1308,7 @@ TEST(TestRGWLuaBackground, TableIterate) TEST(TestRGWLuaBackground, TableIterateWrite) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["a"] = 1 @@ -1286,7 +1338,7 @@ TEST(TestRGWLuaBackground, TableIterateWrite) TEST(TestRGWLuaBackground, TableIncrement) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1306,7 +1358,7 @@ TEST(TestRGWLuaBackground, TableIncrement) TEST(TestRGWLuaBackground, TableIncrementBy) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1328,7 +1380,7 @@ TEST(TestRGWLuaBackground, TableIncrementBy) TEST(TestRGWLuaBackground, TableDecrement) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", 
pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1348,7 +1400,7 @@ TEST(TestRGWLuaBackground, TableDecrement) TEST(TestRGWLuaBackground, TableDecrementBy) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); const std::string request_script = R"( RGW["key1"] = 42 @@ -1370,7 +1422,7 @@ TEST(TestRGWLuaBackground, TableDecrementBy) TEST(TestRGWLuaBackground, TableIncrementValueError) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( -- cannot increment string values @@ -1405,7 +1457,7 @@ TEST(TestRGWLuaBackground, TableIncrementValueError) TEST(TestRGWLuaBackground, TableIncrementError) { DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); std::string request_script = R"( -- missing argument @@ -1494,7 +1546,7 @@ TEST(TestRGWLua, Data) )"; DEFINE_REQ_STATE; - TestBackground lua_background(store.get(), "", pe.lua.manager.get()); + TestBackground lua_background(store.get(), pe.lua.manager.get()); s.host_id = "foo"; pe.lua.background = &lua_background; lua::RGWObjFilter filter(&s, script); diff --git a/src/test/strtol.cc b/src/test/strtol.cc index ec3f6715b8e..aac52c6627f 100644 --- a/src/test/strtol.cc +++ b/src/test/strtol.cc @@ -184,6 +184,18 @@ TEST(IECStrToLL, WithUnits) { units["Ti"] = 40; units["Pi"] = 50; units["Ei"] = 60; + units["KB"] = 10; + units["MB"] = 20; + units["GB"] = 30; + units["TB"] = 40; + units["PB"] = 50; + units["EB"] = 60; + units["KiB"] = 10; + units["MiB"] = 20; + units["GiB"] = 30; + units["TiB"] = 40; + units["PiB"] = 50; + units["EiB"] = 60; for (std::map<std::string,int>::iterator p = units.begin(); p != units.end(); ++p) { @@ -259,6 +271,21 @@ TEST(StrictIECCast, Error) { } { std::string err; + (void)strict_iec_cast<int64_t>("1GT", &err); + ASSERT_NE(err, ""); + } + { + std::string err; + (void)strict_iec_cast<int64_t>("1TG", &err); + ASSERT_NE(err, ""); + } + { + std::string err; + (void)strict_iec_cast<int64_t>("1KD", &err); + ASSERT_NE(err, ""); + } + { + std::string err; (void)strict_iec_cast<int64_t>("2E", &err); ASSERT_EQ(err, ""); } diff --git a/src/tools/ceph-dencoder/mds_types.h b/src/tools/ceph-dencoder/mds_types.h index 91ba86be0d1..1272017c438 100644 --- a/src/tools/ceph-dencoder/mds_types.h +++ b/src/tools/ceph-dencoder/mds_types.h @@ -10,13 +10,22 @@ TYPE(SnapInfo) TYPE(snaplink_t) TYPE(sr_t) -#include "mds/mdstypes.h" +#include "mds/SimpleLock.h" +TYPE_NOCOPY(SimpleLock) + +#include "mds/PurgeQueue.h" +TYPE(PurgeItem) + +#include "mds/Anchor.h" +TYPE(Anchor) + #include "include/cephfs/types.h" TYPE(frag_info_t) TYPE(nest_info_t) TYPE(quota_info_t) TYPE(client_writeable_range_t) TYPE_FEATUREFUL(inode_t<std::allocator>) +//TYPE(inode_t<std::allocator>) TYPE_FEATUREFUL(old_inode_t<std::allocator>) TYPE(fnode_t) TYPE(old_rstat_t) @@ -31,6 +40,10 @@ TYPE(mds_load_t) TYPE(MDSCacheObjectInfo) TYPE(inode_backtrace_t) TYPE(inode_backpointer_t) +TYPE(vinodeno_t) + +#include "include/cephfs/metrics/Types.h" +TYPE(ClientMetricMessage) #include "mds/CInode.h" TYPE_FEATUREFUL(InodeStore) @@ -40,12 +53,18 @@ TYPE_FEATUREFUL(InodeStoreBare) TYPE_FEATUREFUL(MDSMap) TYPE_FEATUREFUL(MDSMap::mds_info_t) +#include 
"mds/flock.h" +TYPE(ceph_lock_state_t) + #include "mds/FSMap.h" //TYPE_FEATUREFUL(Filesystem) TYPE_FEATUREFUL(FSMap) +TYPE(MirrorInfo) #include "mds/Capability.h" TYPE_NOCOPY(Capability) +TYPE(Capability::Export) +TYPE(Capability::Import) #include "mds/inode_backtrace.h" TYPE(inode_backpointer_t) @@ -54,8 +73,11 @@ TYPE(inode_backtrace_t) #include "mds/InoTable.h" TYPE(InoTable) +#include "mds/SessionMap.h" +//TYPE_FEATUREFUL(SessionMapStore) + #include "mds/SnapServer.h" -TYPE_STRAYDATA(SnapServer) +TYPE_FEATUREFUL(SnapServer) #include "mds/events/ECommitted.h" TYPE_FEATUREFUL_NOCOPY(ECommitted) @@ -109,4 +131,22 @@ TYPE_FEATUREFUL_NOCOPY(ETableServer) #include "mds/events/EUpdate.h" TYPE_FEATUREFUL_NOCOPY(EUpdate) + +#include "mgr/MetricTypes.h" +TYPE(MDSMetricPayload) +TYPE(MetricReportMessage) +TYPE(MDSConfigPayload) + +#include "mds/mdstypes.h" +TYPE(metareqid_t) +TYPE(feature_bitset_t) +TYPE(dirfrag_t) +TYPE(client_metadata_t) +TYPE(MDSPerfMetricReport) +TYPE(metric_spec_t) + +#include "messages/MMDSBeacon.h" +TYPE(MDSHealthMetric) +TYPE(MDSHealth) + #endif // WITH_CEPHFS diff --git a/src/tools/cephfs/shell/cephfs-shell b/src/tools/cephfs/shell/cephfs-shell index 9df0f900604..f95a4afd057 100755 --- a/src/tools/cephfs/shell/cephfs-shell +++ b/src/tools/cephfs/shell/cephfs-shell @@ -15,14 +15,22 @@ import re import shlex import stat import errno +import distro from cmd2 import Cmd from cmd2 import __version__ as cmd2_version from packaging.version import Version +# DFLAG is used to override the checks done by cephfs-shell +# for cmd2 versions due to weird behaviour of Ubuntu22.04 with +# cmd2's version i.e. it always gets the version of cmd2 as +# "0.0.0" instead of the actual cmd2 version. +DFLAG = False +if distro.name() == "Ubuntu" and distro.version() == "22.04": + DFLAG = True # XXX: In cmd2 versions < 1.0.1, we'll get SystemExit(2) instead of # Cmd2ArgparseError -if Version(cmd2_version) >= Version("1.0.1"): +if Version(cmd2_version) >= Version("1.0.1") or DFLAG is True: from cmd2.exceptions import Cmd2ArgparseError else: # HACK: so that we don't have check for version everywhere @@ -1700,7 +1708,7 @@ def read_shell_conf(shell, shell_conf_file): sec = 'cephfs-shell' opts = [] - if Version(cmd2_version) >= Version("0.10.0"): + if Version(cmd2_version) >= Version("0.10.0") or DFLAG is True: for attr in shell.settables.keys(): opts.append(attr) else: @@ -1768,7 +1776,7 @@ def manage_args(): args.exe_and_quit = False # Execute and quit, don't launch the shell. if args.batch: - if Version(cmd2_version) <= Version("0.9.13"): + if Version(cmd2_version) <= Version("0.9.13") and DFLAG is not True: args.commands = ['load ' + args.batch, ',quit'] else: args.commands = ['run_script ' + args.batch, ',quit'] @@ -1813,7 +1821,7 @@ def execute_cmds_and_quit(args): # value to indicate whether the execution of the commands should stop, but # since 0.9.7 it returns the return value of do_* methods only if it's # not None. When it is None it returns False instead of None. 
- if Version(cmd2_version) <= Version("0.9.6"): + if Version(cmd2_version) <= Version("0.9.6") and DFLAG is not True: stop_exec_val = None else: stop_exec_val = False diff --git a/src/tools/cephfs_mirror/PeerReplayer.cc b/src/tools/cephfs_mirror/PeerReplayer.cc index b56ca9a2f1c..91117cf5f2b 100644 --- a/src/tools/cephfs_mirror/PeerReplayer.cc +++ b/src/tools/cephfs_mirror/PeerReplayer.cc @@ -1282,6 +1282,12 @@ int PeerReplayer::do_synchronize(const std::string &dir_root, const Snapshot &cu break; } + r = pre_sync_check_and_open_handles(dir_root, current, boost::none, &fh); + if (r < 0) { + dout(5) << ": cannot proceed with sync: " << cpp_strerror(r) << dendl; + return r; + } + dout(20) << ": " << sync_stack.size() << " entries in stack" << dendl; std::string e_name; auto &entry = sync_stack.top(); @@ -1687,7 +1693,7 @@ int PeerReplayer::do_sync_snaps(const std::string &dir_root) { double duration = 0; for (; it != local_snap_map.end(); ++it) { if (m_perf_counters) { - start = std::chrono::duration_cast<std::chrono::milliseconds>(clock::now().time_since_epoch()).count(); + start = std::chrono::duration_cast<std::chrono::seconds>(clock::now().time_since_epoch()).count(); utime_t t; t.set_from_double(start); m_perf_counters->tset(l_cephfs_mirror_peer_replayer_last_synced_start, t); @@ -1706,7 +1712,7 @@ int PeerReplayer::do_sync_snaps(const std::string &dir_root) { } if (m_perf_counters) { m_perf_counters->inc(l_cephfs_mirror_peer_replayer_snaps_synced); - end = std::chrono::duration_cast<std::chrono::milliseconds>(clock::now().time_since_epoch()).count(); + end = std::chrono::duration_cast<std::chrono::seconds>(clock::now().time_since_epoch()).count(); utime_t t; t.set_from_double(end); m_perf_counters->tset(l_cephfs_mirror_peer_replayer_last_synced_end, t); diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc index da54d441e0c..9dfe9d36c0c 100644 --- a/src/tools/rados/rados.cc +++ b/src/tools/rados/rados.cc @@ -136,10 +136,11 @@ void usage(ostream& out) " getomapval <obj-name> <key> [file] show the value for the specified key\n" " in the object's object map\n" " setomapval <obj-name> <key> <val | --input-file file>\n" -" rmomapkey <obj-name> <key> Remove key from the object map of <obj-name>\n" +" rmomapkey <obj-name> <key> remove key from the object map of <obj-name>\n" " clearomap <obj-name> [obj-name2 obj-name3...] clear all the omap keys for the specified objects\n" -" getomapheader <obj-name> [file] Dump the hexadecimal value of the object map header of <obj-name>\n" -" setomapheader <obj-name> <val> Set the value of the object map header of <obj-name>\n" +" getomapheader <obj-name> [file] dump the hexadecimal value of the object map header of <obj-name>\n" +" setomapheader <obj-name> <val | --input-file file>\n" +" set the value of the object map header of <obj-name>\n" " watch <obj-name> add watcher on this object\n" " notify <obj-name> <message> notify watcher of this object with message\n" " listwatchers <obj-name> list the watchers of this object\n" @@ -2844,17 +2845,33 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts, ret = 0; } } else if (strcmp(nargs[0], "setomapheader") == 0) { - if (!pool_name || nargs.size() < (obj_name ? 
2 : 3)) { + uint32_t min_args = 3; + if (obj_name) { + min_args--; + } + if (!input_file.empty()) { + min_args--; + } + + if (!pool_name || nargs.size() < min_args) { usage(cerr); return 1; } - bufferlist bl; if (!obj_name) { obj_name = nargs[1]; - bl.append(nargs[2]); // val + } + + bufferlist bl; + if (!input_file.empty()) { + string err; + ret = bl.read_file(input_file.c_str(), &err); + if (ret < 0) { + cerr << "error reading file " << input_file.c_str() << ": " << err << std::endl; + return 1; + } } else { - bl.append(nargs[1]); // val + bl.append(nargs[min_args - 1]); // val } ret = io_ctx.omap_set_header(*obj_name, bl); if (ret < 0) { diff --git a/src/tools/rbd/Utils.cc b/src/tools/rbd/Utils.cc index 71da0bd274a..95c8725aa33 100644 --- a/src/tools/rbd/Utils.cc +++ b/src/tools/rbd/Utils.cc @@ -478,10 +478,11 @@ int validate_snapshot_name(at::ArgumentModifier mod, int get_image_options(const boost::program_options::variables_map &vm, bool get_format, librbd::ImageOptions *opts) { uint64_t order = 0, stripe_unit = 0, stripe_count = 0, object_size = 0; - uint64_t features = 0, features_clear = 0; + uint64_t features = 0, features_set = 0, features_clear = 0; std::string data_pool; bool order_specified = true; bool features_specified = false; + bool features_set_specified = false; bool features_clear_specified = false; bool stripe_specified = false; @@ -509,6 +510,13 @@ int get_image_options(const boost::program_options::variables_map &vm, stripe_specified = true; } + if (vm.count(at::IMAGE_MIRROR_IMAGE_MODE) && + vm[at::IMAGE_MIRROR_IMAGE_MODE].as<librbd::mirror_image_mode_t>() == + RBD_MIRROR_IMAGE_MODE_JOURNAL) { + features_set |= (RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_JOURNALING); + features_set_specified = true; + } + if (vm.count(at::IMAGE_SHARED) && vm[at::IMAGE_SHARED].as<bool>()) { if (features_specified) { features &= ~RBD_FEATURES_SINGLE_CLIENT; @@ -581,6 +589,8 @@ int get_image_options(const boost::program_options::variables_map &vm, opts->set(RBD_IMAGE_OPTION_ORDER, order); if (features_specified) opts->set(RBD_IMAGE_OPTION_FEATURES, features); + if (features_set_specified) + opts->set(RBD_IMAGE_OPTION_FEATURES_SET, features_set); if (features_clear_specified) { opts->set(RBD_IMAGE_OPTION_FEATURES_CLEAR, features_clear); } |
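The final hunk introduces a features_set mask alongside the existing features_clear mask: requesting journal-based mirroring ORs the exclusive-lock and journaling bits into the image options instead of overriding an explicitly supplied feature list. The sketch below illustrates that set/clear bitmask pattern in isolation; the feature constants and the resolve_features helper are defined locally for illustration and are not librbd's API, which applies RBD_IMAGE_OPTION_FEATURES_SET and RBD_IMAGE_OPTION_FEATURES_CLEAR internally.

```cpp
#include <cstdint>
#include <iostream>

// Illustrative feature bits, defined locally for this sketch.
constexpr std::uint64_t FEATURE_LAYERING       = 1 << 0;
constexpr std::uint64_t FEATURE_EXCLUSIVE_LOCK = 1 << 2;
constexpr std::uint64_t FEATURE_JOURNALING     = 1 << 6;

// Start from an explicit base mask if one was given, then force bits on,
// then force bits off.
std::uint64_t resolve_features(std::uint64_t defaults,
                               bool base_specified, std::uint64_t base,
                               std::uint64_t set_mask,
                               std::uint64_t clear_mask) {
  std::uint64_t f = base_specified ? base : defaults;
  f |= set_mask;     // e.g. journal mirroring implies lock + journaling
  f &= ~clear_mask;  // e.g. a "shared" image strips single-client features
  return f;
}

int main() {
  std::uint64_t f = resolve_features(
      /*defaults=*/FEATURE_LAYERING,
      /*base_specified=*/false, /*base=*/0,
      /*set_mask=*/FEATURE_EXCLUSIVE_LOCK | FEATURE_JOURNALING,
      /*clear_mask=*/0);
  std::cout << std::hex << "0x" << f << "\n";  // layering + lock + journaling
}
```

Keeping the "force on" and "force off" requests in separate masks, rather than mutating a single feature word up front, is what allows an option such as a mirroring mode to add requirements without clobbering features the user asked for explicitly.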