-rw-r--r--  PendingReleaseNotes  13
-rw-r--r--  README.md  54
-rw-r--r--  debian/control  4
-rw-r--r--  doc/_ext/ceph_commands.py  2
-rw-r--r--  doc/cephadm/services/osd.rst  12
-rw-r--r--  doc/cephadm/services/rgw.rst  26
-rw-r--r--  doc/cephfs/index.rst  1
-rw-r--r--  doc/cephfs/snapshots.rst  85
-rw-r--r--  doc/conf.py  18
-rw-r--r--  doc/dev/crimson/crimson.rst  173
-rw-r--r--  doc/dev/developer_guide/testing_integration_tests/tests-integration-testing-teuthology-workflow.rst  59
-rw-r--r--  doc/dev/developer_guide/testing_integration_tests/workflow.png  bin 0 -> 257138 bytes
-rw-r--r--  doc/man/8/cephadm.rst  15
-rw-r--r--  doc/rados/operations/health-checks.rst  8
-rw-r--r--  doc/rados/operations/stretch-mode.rst  30
-rw-r--r--  doc/radosgw/account.rst  5
-rw-r--r--  doc/radosgw/config-ref.rst  2
-rw-r--r--  doc/radosgw/s3/objectops.rst  4
-rw-r--r--  monitoring/ceph-mixin/config.libsonnet  8
-rw-r--r--  monitoring/ceph-mixin/prometheus_alerts.yml  14
-rw-r--r--  monitoring/ceph-mixin/tests_alerts/test_alerts.yml  93
l---------  qa/suites/crimson-rados-experimental/.qa  2
l---------  qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml  1
-rw-r--r--  qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml  14
-rw-r--r--  qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml  18
-rw-r--r--  qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml  28
-rw-r--r--  qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml  18
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/% (renamed from qa/suites/crimson-rados-experimental/seastore/basic/%)  0
l---------  qa/suites/crimson-rados-experimental/thrash/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/.qa)  0
l---------  qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/.qa)  0
l---------  qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled  1
l---------  qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml  1
l---------  qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa)  0
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml  0
l---------  qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml (renamed from qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled)  0
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$  0
l---------  qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa)  0
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml  0
l---------  qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled  1
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled  6
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled  5
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled  5
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/clusters/+  0
l---------  qa/suites/crimson-rados-experimental/thrash/clusters/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa)  0
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml (renamed from qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml)  9
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled  4
l---------  qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro  1
l---------  qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml (renamed from qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml)  0
l---------  qa/suites/crimson-rados-experimental/thrash/deploy/.qa (renamed from qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa)  0
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml  11
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled  16
l---------  qa/suites/crimson-rados-experimental/thrash/objectstore/.qa  1
l---------  qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml (renamed from qa/suites/crimson-rados/thrash/objectstore/seastore.yaml)  0
l---------  qa/suites/crimson-rados-experimental/thrash/thrashers/.qa  1
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml  34
l---------  qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml  1
l---------  qa/suites/crimson-rados-experimental/thrash/workloads/.qa  1
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml  13
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml  20
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml  49
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml  24
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml  24
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml  24
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml  23
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml  15
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml  15
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml  14
-rw-r--r--  qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml  8
l---------  qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml (renamed from qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled)  0
l---------  qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled (renamed from qa/suites/crimson-rados-experimental/seastore/basic/objectstore/crimson_seastore.yaml)  0
l---------  qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml  1
-rw-r--r--  qa/suites/rados/verify/validater/valgrind.yaml  1
-rw-r--r--  qa/suites/upgrade/quincy-x/parallel/0-start.yaml  11
-rw-r--r--  qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml  7
-rw-r--r--  qa/suites/upgrade/quincy-x/stress-split/1-start.yaml  12
-rw-r--r--  qa/suites/upgrade/reef-x/parallel/0-start.yaml  20
-rw-r--r--  qa/suites/upgrade/reef-x/parallel/1-tasks.yaml  8
-rw-r--r--  qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml  23
-rw-r--r--  qa/suites/upgrade/reef-x/stress-split/1-start.yaml  14
-rw-r--r--  qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml  23
-rw-r--r--  qa/tasks/cephfs/test_exports.py  2
-rw-r--r--  qa/tasks/nvmeof.py  2
-rw-r--r--  qa/tasks/rgw_multisite.py  2
-rwxr-xr-x  qa/workunits/nvmeof/basic_tests.sh  4
-rwxr-xr-x  qa/workunits/nvmeof/fio_test.sh  2
-rw-r--r--  src/ceph-volume/ceph_volume/devices/lvm/zap.py  3
-rw-r--r--  src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py  2
-rw-r--r--  src/ceph-volume/ceph_volume/util/disk.py  19
-rw-r--r--  src/ceph-volume/ceph_volume/util/prepare.py  3
-rwxr-xr-x  src/cephadm/cephadm.py  42
-rw-r--r--  src/cephadm/cephadmlib/daemons/monitoring.py  28
-rw-r--r--  src/cephadm/cephadmlib/file_utils.py  24
-rw-r--r--  src/cephadm/cephadmlib/net_utils.py  14
-rw-r--r--  src/cephadm/tests/test_deploy.py  2
-rw-r--r--  src/common/CMakeLists.txt  1
-rw-r--r--  src/common/bit_vector.hpp  4
-rw-r--r--  src/common/ceph_time.h  17
-rw-r--r--  src/common/io_exerciser/CMakeLists.txt  4
-rw-r--r--  src/common/io_exerciser/DataGenerator.cc  794
-rw-r--r--  src/common/io_exerciser/DataGenerator.h  268
-rw-r--r--  src/common/io_exerciser/EcIoSequence.cc  267
-rw-r--r--  src/common/io_exerciser/EcIoSequence.h  65
-rw-r--r--  src/common/io_exerciser/IoOp.cc  424
-rw-r--r--  src/common/io_exerciser/IoOp.h  312
-rw-r--r--  src/common/io_exerciser/IoSequence.cc  327
-rw-r--r--  src/common/io_exerciser/IoSequence.h  399
-rw-r--r--  src/common/io_exerciser/Model.cc  24
-rw-r--r--  src/common/io_exerciser/Model.h  62
-rw-r--r--  src/common/io_exerciser/ObjectModel.cc  242
-rw-r--r--  src/common/io_exerciser/ObjectModel.h  75
-rw-r--r--  src/common/io_exerciser/OpType.h  91
-rw-r--r--  src/common/io_exerciser/RadosIo.cc  577
-rw-r--r--  src/common/io_exerciser/RadosIo.h  112
-rw-r--r--  src/common/json/BalancerStructures.cc  38
-rw-r--r--  src/common/json/BalancerStructures.h  35
-rw-r--r--  src/common/json/CMakeLists.txt  4
-rw-r--r--  src/common/json/ConfigStructures.cc  20
-rw-r--r--  src/common/json/ConfigStructures.h  24
-rw-r--r--  src/common/json/OSDStructures.cc  150
-rw-r--r--  src/common/json/OSDStructures.h  189
-rw-r--r--  src/common/options/crimson.yaml.in  2
-rw-r--r--  src/common/options/mds.yaml.in  2
-rw-r--r--  src/common/options/mon.yaml.in  7
-rw-r--r--  src/common/pick_address.cc  29
-rw-r--r--  src/common/pick_address.h  2
-rw-r--r--  src/crimson/common/shared_lru.h  23
-rw-r--r--  src/crimson/os/alienstore/alien_store.cc  44
-rw-r--r--  src/crimson/os/alienstore/alien_store.h  35
-rw-r--r--  src/crimson/os/cyanstore/cyan_store.cc  27
-rw-r--r--  src/crimson/os/cyanstore/cyan_store.h  27
-rw-r--r--  src/crimson/os/futurized_store.h  27
-rw-r--r--  src/crimson/os/seastore/async_cleaner.cc  6
-rw-r--r--  src/crimson/os/seastore/async_cleaner.h  14
-rw-r--r--  src/crimson/os/seastore/cache.cc  8
-rw-r--r--  src/crimson/os/seastore/cache.h  23
-rw-r--r--  src/crimson/os/seastore/seastore.cc  49
-rw-r--r--  src/crimson/os/seastore/seastore.h  36
-rw-r--r--  src/crimson/os/seastore/seastore_types.h  33
-rw-r--r--  src/crimson/os/seastore/transaction.h  15
-rw-r--r--  src/crimson/os/seastore/transaction_manager.cc  2
-rw-r--r--  src/crimson/os/seastore/transaction_manager.h  3
-rw-r--r--  src/crimson/osd/backfill_facades.h  3
-rw-r--r--  src/crimson/osd/backfill_state.cc  266
-rw-r--r--  src/crimson/osd/backfill_state.h  78
-rw-r--r--  src/crimson/osd/osd_operation.h  15
-rw-r--r--  src/crimson/osd/osd_operations/client_request.h  4
-rw-r--r--  src/crimson/osd/osd_operations/logmissing_request.h  3
-rw-r--r--  src/crimson/osd/osd_operations/logmissing_request_reply.h  3
-rw-r--r--  src/crimson/osd/osd_operations/peering_event.h  8
-rw-r--r--  src/crimson/osd/osd_operations/pg_advance_map.h  4
-rw-r--r--  src/crimson/osd/osd_operations/recovery_subrequest.h  3
-rw-r--r--  src/crimson/osd/osd_operations/replicated_request.h  3
-rw-r--r--  src/crimson/osd/osd_operations/scrub_events.h  12
-rw-r--r--  src/crimson/osd/pg.cc  11
-rw-r--r--  src/crimson/osd/pg.h  5
-rw-r--r--  src/crimson/osd/pg_backend.cc  15
-rw-r--r--  src/crimson/osd/pg_backend.h  9
-rw-r--r--  src/crimson/osd/pg_recovery.cc  19
-rw-r--r--  src/crimson/osd/pg_recovery.h  2
-rw-r--r--  src/crimson/osd/pg_shard_manager.h  46
-rw-r--r--  src/crimson/osd/replicated_backend.cc  32
-rw-r--r--  src/crimson/osd/replicated_recovery_backend.cc  41
-rw-r--r--  src/crimson/osd/shard_services.cc  5
-rw-r--r--  src/crimson/osd/shard_services.h  3
-rw-r--r--  src/crimson/tools/store_nbd/tm_driver.cc  2
-rw-r--r--  src/exporter/ceph_exporter.cc  12
-rw-r--r--  src/include/rados/librados.hpp  34
-rw-r--r--  src/include/random.h  4
-rw-r--r--  src/kv/KeyValueDB.h  22
-rw-r--r--  src/kv/RocksDBStore.cc  92
-rw-r--r--  src/kv/RocksDBStore.h  4
-rw-r--r--  src/librados/librados_asio.h  72
-rw-r--r--  src/librados/librados_cxx.cc  8
-rw-r--r--  src/mgr/PyModule.h  4
-rwxr-xr-x  src/mon/NVMeofGwMap.cc  43
-rwxr-xr-x  src/mon/NVMeofGwMap.h  2
-rw-r--r--  src/mon/NVMeofGwMon.h  2
-rw-r--r--  src/os/DBObjectMap.cc  5
-rw-r--r--  src/os/DBObjectMap.h  2
-rw-r--r--  src/os/ObjectStore.h  52
-rw-r--r--  src/os/bluestore/BlueStore.cc  157
-rw-r--r--  src/os/bluestore/BlueStore.h  18
-rw-r--r--  src/os/kstore/KStore.cc  72
-rw-r--r--  src/os/kstore/KStore.h  8
-rw-r--r--  src/os/memstore/MemStore.cc  70
-rw-r--r--  src/os/memstore/MemStore.h  15
-rw-r--r--  src/osd/ECBackend.cc  26
-rw-r--r--  src/osd/ECCommon.cc  315
-rw-r--r--  src/osd/ECCommon.h  13
-rw-r--r--  src/osd/OSD.cc  96
-rw-r--r--  src/osd/OSDMap.cc  6
-rw-r--r--  src/osd/PGBackend.h  4
-rw-r--r--  src/osd/PrimaryLogPG.cc  76
-rw-r--r--  src/osd/PrimaryLogPG.h  8
-rw-r--r--  src/pybind/mgr/cephadm/inventory.py  4
-rw-r--r--  src/pybind/mgr/cephadm/module.py  46
-rw-r--r--  src/pybind/mgr/cephadm/schedule.py  2
-rw-r--r--  src/pybind/mgr/cephadm/services/cephadmservice.py  8
-rw-r--r--  src/pybind/mgr/cephadm/services/monitoring.py  16
-rw-r--r--  src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2  2
-rw-r--r--  src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2  1
-rw-r--r--  src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2  16
-rw-r--r--  src/pybind/mgr/cephadm/tests/test_cephadm.py  12
-rw-r--r--  src/pybind/mgr/cephadm/tests/test_services.py  70
-rw-r--r--  src/pybind/mgr/orchestrator/_interface.py  4
-rw-r--r--  src/pybind/mgr/orchestrator/module.py  24
-rw-r--r--  src/python-common/ceph/deployment/service_spec.py  7
-rw-r--r--  src/rgw/CMakeLists.txt  6
-rw-r--r--  src/rgw/driver/daos/rgw_sal_daos.cc  2
-rw-r--r--  src/rgw/driver/daos/rgw_sal_daos.h  1
-rw-r--r--  src/rgw/driver/motr/rgw_sal_motr.cc  5
-rw-r--r--  src/rgw/driver/motr/rgw_sal_motr.h  1
-rw-r--r--  src/rgw/driver/posix/rgw_sal_posix.cc  8
-rw-r--r--  src/rgw/driver/posix/rgw_sal_posix.h  7
-rw-r--r--  src/rgw/driver/rados/rgw_period.cc  14
-rw-r--r--  src/rgw/driver/rados/rgw_rados.cc  15
-rw-r--r--  src/rgw/driver/rados/rgw_rados.h  6
-rw-r--r--  src/rgw/driver/rados/rgw_sal_rados.cc  108
-rw-r--r--  src/rgw/driver/rados/rgw_sal_rados.h  9
-rw-r--r--  src/rgw/driver/rados/rgw_tools.cc  32
-rw-r--r--  src/rgw/driver/rados/rgw_user.h  8
-rw-r--r--  src/rgw/driver/rados/rgw_zone.h  1
-rw-r--r--  src/rgw/radosgw-admin/orphan.cc (renamed from src/rgw/rgw_orphan.cc)  7
-rw-r--r--  src/rgw/radosgw-admin/orphan.h (renamed from src/rgw/rgw_orphan.h)  0
-rw-r--r--  src/rgw/radosgw-admin/radosgw-admin.cc (renamed from src/rgw/rgw_admin.cc)  39
-rw-r--r--  src/rgw/radosgw-admin/sync_checkpoint.cc (renamed from src/rgw/rgw_sync_checkpoint.cc)  6
-rw-r--r--  src/rgw/radosgw-admin/sync_checkpoint.h (renamed from src/rgw/rgw_sync_checkpoint.h)  0
-rw-r--r--  src/rgw/rgw_cksum_pipe.cc  11
-rw-r--r--  src/rgw/rgw_cksum_pipe.h  36
-rw-r--r--  src/rgw/rgw_common.cc  1
-rw-r--r--  src/rgw/rgw_common.h  1
-rw-r--r--  src/rgw/rgw_iam_policy.cc  9
-rw-r--r--  src/rgw/rgw_iam_policy.h  4
-rw-r--r--  src/rgw/rgw_op.cc  189
-rw-r--r--  src/rgw/rgw_op.h  47
-rw-r--r--  src/rgw/rgw_op_type.h  1
-rw-r--r--  src/rgw/rgw_rest.h  11
-rw-r--r--  src/rgw/rgw_rest_s3.cc  199
-rw-r--r--  src/rgw/rgw_rest_s3.h  18
-rw-r--r--  src/rgw/rgw_sal.h  31
-rw-r--r--  src/rgw/rgw_sal_dbstore.cc  16
-rw-r--r--  src/rgw/rgw_sal_dbstore.h  9
-rw-r--r--  src/rgw/rgw_sal_filter.cc  11
-rw-r--r--  src/rgw/rgw_sal_filter.h  9
-rw-r--r--  src/rgw/services/svc_zone.cc  12
-rw-r--r--  src/rgw/services/svc_zone.h  1
-rwxr-xr-x  src/script/run-make.sh  1
-rw-r--r--  src/test/ObjectMap/KeyValueDBMemory.cc  21
-rw-r--r--  src/test/crimson/seastore/test_btree_lba_manager.cc  25
-rw-r--r--  src/test/crimson/seastore/test_seastore_cache.cc  5
-rw-r--r--  src/test/crimson/test_backfill.cc  142
-rw-r--r--  src/test/librados/aio.cc  56
-rw-r--r--  src/test/librados/aio_cxx.cc  89
-rw-r--r--  src/test/librados/asio.cc  137
-rw-r--r--  src/test/objectstore/ObjectStoreImitator.h  10
-rw-r--r--  src/test/objectstore/allocsim/ops_replayer.cc  35
-rw-r--r--  src/test/osd/CMakeLists.txt  2
-rw-r--r--  src/test/osd/ceph_test_rados_io_sequence.cc  1041
-rw-r--r--  src/test/osd/ceph_test_rados_io_sequence.h  580
-rw-r--r--  src/test/pybind/pytest.ini  1
-rw-r--r--  src/test/pybind/test_rados.py  2
-rw-r--r--  src/test/rgw/rgw_multi/tests.py  22
-rw-r--r--  src/test/rgw/test_rgw_iam_policy.cc  8
-rw-r--r--  src/test/test_ipaddr.cc  155
-rwxr-xr-x  src/vstart.sh  20
265 files changed, 8217 insertions, 3314 deletions
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 6ea402c9991..b4824a65584 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -1,5 +1,16 @@
>=20.0.0
+* RGW: The User Account feature introduced in Squid provides first-class support for
+ IAM APIs and policy. Our preliminary STS support was instead based on tenants, and
+ exposed some IAM APIs to admins only. This tenant-level IAM functionality is now
+ deprecated in favor of accounts. While we'll continue to support the tenant feature
+ itself for namespace isolation, the following features will be removed no sooner
+ than the V release:
+ * tenant-level IAM APIs like CreateRole, PutRolePolicy and PutUserPolicy,
+ * use of tenant names instead of accounts in IAM policy documents,
+ * interpretation of IAM policy without cross-account policy evaluation,
+ * S3 API support for cross-tenant names such as `Bucket='tenant:bucketname'`
+
* RBD: All Python APIs that produce timestamps now return "aware" `datetime`
objects instead of "naive" ones (i.e. those including time zone information
instead of those not including it). All timestamps remain in UTC but
@@ -34,6 +45,8 @@
(--yes-i-really-mean-it). This has been added as a precaution to tell the
users that modifying "max_mds" may not help with troubleshooting or recovery
effort. Instead, it might further destabilize the cluster.
+* RADOS: Added convenience function `librados::AioCompletion::cancel()` with
+ the same behavior as `librados::IoCtx::aio_cancel()`.
* mgr/restful, mgr/zabbix: both modules, already deprecated since 2020, have been
finally removed. They have not been actively maintained in recent years,
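The RBD note above concerns timestamp-producing calls in the Python binding. A
minimal sketch of what the change means for callers, assuming an already-open
``rbd.Image`` handle named ``image`` (the handle setup is omitted and
``create_timestamp()`` is used as one example of a timestamp-producing call):

.. code-block:: python

    from datetime import timezone

    # 'image' is an open rbd.Image handle (setup omitted).
    ts = image.create_timestamp()

    # Previously the binding returned a "naive" datetime: ts.tzinfo was None
    # even though the value was expressed in UTC. Now the zone is explicit.
    assert ts.tzinfo is not None
    print(ts.astimezone(timezone.utc))  # still UTC, now unambiguously so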
diff --git a/README.md b/README.md
index a3fdfada623..f8fcf35e8b7 100644
--- a/README.md
+++ b/README.md
@@ -127,31 +127,7 @@ To build Ceph, follow this procedure:
ninja install
-## Tips and Tricks
-
- * Use "debug builds" only when needed. Debugging builds are helpful for
- development, but they can slow down performance. Use
- `-DCMAKE_BUILD_TYPE=Release` when debugging isn't necessary.
- * Enable Selective Daemons when testing specific components. Don't start
- unnecessary daemons.
- * Preserve Existing Data skip cluster reinitialization between tests by
- using the `-n` flag.
-
-## Troubleshooting
-
- * Cluster Fails to Start: Look for errors in the logs under the `out/`
- directory.
- * OSD Crashes: Check the OSD logs for errors.
- * Cluster in a `Health Error` State: Run the `ceph status` command to
- identify the issue.
- * RocksDB Errors: Look for RocksDB-related errors in the OSD logs.
- To manage a vstart cluster, stop daemons using `./stop.sh` and start them with ./vstart.sh --daemon osd.${ID} [--nodaemonize].
- Restart by stopping and restarting daemons, ensuring no stale sockets.
- For RocksDB performance tracking, set `export ROCKSDB_PERF=true` and start the cluster with `./vstart.sh -n -d -x --bluestore`.
- Build with `vstart-base` using debug flags in cmake, compile, and deploy via `./vstart.sh -d -n --bluestore`.
- To containerize, generate configurations with `vstart.sh`, and deploy with Docker, mapping directories and configuring the network.
- Manage containers using `docker run`, `stop`, and `rm`. For detailed setups, consult the Ceph-Container repository.
### CMake Options
@@ -204,6 +180,36 @@ The diagnostic colors will be visible when the following command is run:
Other available values for `DIAGNOSTICS_COLOR` are `auto` (default) and
`never`.
+## Tips and Tricks
+
+ * Use "debug builds" only when needed. Debugging builds are helpful for
+ development, but they can slow down performance. Use
+ `-DCMAKE_BUILD_TYPE=Release` when debugging isn't necessary.
+ * Enable Selective Daemons when testing specific components. Don't start
+ unnecessary daemons.
+ * Preserve Existing Data skip cluster reinitialization between tests by
+ using the `-n` flag.
+ * To manage a vstart cluster, stop daemons using `./stop.sh` and start them
+ with `./vstart.sh --daemon osd.${ID} [--nodaemonize]`.
+ * Restart daemons by stopping them and then starting them again, ensuring
+ that no stale sockets remain in the cluster.
+ * To track RocksDB performance, set `export ROCKSDB_PERF=true` and start
+ the cluster by using the command `./vstart.sh -n -d -x --bluestore`.
+ * Build with `vstart-base` using debug flags in cmake, compile, and deploy
+ via `./vstart.sh -d -n --bluestore`.
+ * To containerize, generate configurations with `vstart.sh`, and deploy with
+ Docker, mapping directories and configuring the network.
+ * Manage containers using `docker run`, `stop`, and `rm`. For detailed
+ setups, consult the Ceph-Container repository.
+
+## Troubleshooting
+
+ * Cluster Fails to Start: Look for errors in the logs under the `out/`
+ directory.
+ * OSD Crashes: Check the OSD logs for errors.
+ * Cluster in a `Health Error` State: Run the `ceph status` command to
+ identify the issue.
+ * RocksDB Errors: Look for RocksDB-related errors in the OSD logs.
## Building a source tarball
diff --git a/debian/control b/debian/control
index ec04c2599cd..a7d2dbb4c3a 100644
--- a/debian/control
+++ b/debian/control
@@ -996,10 +996,10 @@ Package: librgw2
Architecture: linux-any
Section: libs
Depends: librados2 (= ${binary:Version}),
+ liblua5.3-0,
${misc:Depends},
${shlibs:Depends},
- liblua5.3-dev,
- luarocks,
+Suggests: luarocks,
Description: RADOS Gateway client library
RADOS is a distributed object store used by the Ceph distributed
storage system. This package provides a REST gateway to the
diff --git a/doc/_ext/ceph_commands.py b/doc/_ext/ceph_commands.py
index 0697c71f0e1..d96eab08853 100644
--- a/doc/_ext/ceph_commands.py
+++ b/doc/_ext/ceph_commands.py
@@ -94,7 +94,7 @@ class CmdParam(object):
self.goodchars = goodchars
self.positional = positional != 'false'
- assert who == None
+ assert who is None
def help(self):
advanced = []
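For context on the one-line fix above: ``==`` dispatches to ``__eq__``, which a
class may override, while ``is`` tests object identity and cannot be fooled. A
tiny self-contained illustration (the ``Chatty`` class is contrived):

.. code-block:: python

    class Chatty:
        def __eq__(self, other):
            return True  # claims equality with everything, even None

    c = Chatty()
    print(c == None)   # True  -- __eq__ can lie
    print(c is None)   # False -- identity comparison is reliable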
diff --git a/doc/cephadm/services/osd.rst b/doc/cephadm/services/osd.rst
index 831bd238c79..90ebd86f897 100644
--- a/doc/cephadm/services/osd.rst
+++ b/doc/cephadm/services/osd.rst
@@ -198,6 +198,18 @@ There are a few ways to create new OSDs:
.. warning:: When deploying new OSDs with ``cephadm``, ensure that the ``ceph-osd`` package is not already installed on the target host. If it is installed, conflicts may arise in the management and control of the OSD that may lead to errors or unexpected behavior.
+* OSDs created via ``ceph orch daemon add`` are, by default, not added to an explicit OSD service; they are placed in the generic 'osd' service. To attach an OSD to a different, existing OSD service, issue a command of the following form:
+
+ .. prompt:: bash #
+
+ ceph orch osd set-spec-affinity <service_name> <osd_id(s)>
+
+ For example:
+
+ .. prompt:: bash #
+
+ ceph orch osd set-spec-affinity osd.default_drive_group 0 1
+
Dry Run
-------
diff --git a/doc/cephadm/services/rgw.rst b/doc/cephadm/services/rgw.rst
index ed0b149365a..3df8ed2fc56 100644
--- a/doc/cephadm/services/rgw.rst
+++ b/doc/cephadm/services/rgw.rst
@@ -173,6 +173,32 @@ Then apply this yaml document:
Note the value of ``rgw_frontend_ssl_certificate`` is a literal string as
indicated by a ``|`` character preserving newline characters.
+Disabling multisite sync traffic
+--------------------------------
+
+There is an RGW config option called ``rgw_run_sync_thread`` that tells the
+RGW daemon not to transmit multisite replication data. This is useful if you want
+that RGW daemon to be dedicated to I/O rather than multisite sync operations.
+The RGW spec file includes a setting ``disable_multisite_sync_traffic`` that, when
+set to "True", tells cephadm to set ``rgw_run_sync_thread`` to false for all
+RGW daemons deployed for that RGW service. For example:
+
+.. code-block:: yaml
+
+ service_type: rgw
+ service_id: foo
+ placement:
+ label: rgw
+ spec:
+ rgw_realm: myrealm
+ rgw_zone: myzone
+ rgw_zonegroup: myzg
+ disable_multisite_sync_traffic: True
+
+.. note:: This will only stop the RGW daemon(s) from sending replication data.
+ The daemon can still receive replication data unless it has been removed
+ from the zonegroup and zone replication endpoints.
+
Service specification
---------------------
diff --git a/doc/cephfs/index.rst b/doc/cephfs/index.rst
index a8a991b01b0..630d29f1956 100644
--- a/doc/cephfs/index.rst
+++ b/doc/cephfs/index.rst
@@ -148,6 +148,7 @@ CephFS Concepts
LazyIO <lazyio>
Directory fragmentation <dirfrags>
Multiple active MDS daemons <multimds>
+ Snapshots <snapshots>
.. raw:: html
diff --git a/doc/cephfs/snapshots.rst b/doc/cephfs/snapshots.rst
new file mode 100644
index 00000000000..a60be96ed53
--- /dev/null
+++ b/doc/cephfs/snapshots.rst
@@ -0,0 +1,85 @@
+================
+CephFS Snapshots
+================
+
+CephFS snapshots create an immutable view of the file system at the point
+in time they are taken. CephFS supports snapshots, which are managed in a
+special hidden subdirectory named ``.snap``. Snapshots are created using
+``mkdir`` inside this directory.
+
+Snapshots can be exposed under a different name by changing the following
+client configuration options:
+
+- ``snapdirname``, a mount option for kernel clients
+- ``client_snapdir``, a mount option for ceph-fuse
+
+Snapshot Creation
+==================
+
+The CephFS snapshot feature is enabled by default on new file systems. To enable
+it on an existing file system, use the command below.
+
+.. code-block:: bash
+
+ $ ceph fs set <fs_name> allow_new_snaps true
+
+When snapshots are enabled, all directories in CephFS will have a special ``.snap``
+directory. (You may configure a different name with the client snapdir setting if
+you wish.)
+To create a CephFS snapshot, create a subdirectory under ``.snap`` with a name of
+your choice. For example, to create a snapshot of the directory ``/file1/``, invoke
+``mkdir /file1/.snap/snapshot-name``:
+
+.. code-block:: bash
+
+ $ touch file1
+ $ cd .snap
+ $ mkdir my_snapshot
+
+Using snapshots to recover data
+===============================
+
+Snapshots can also be used to recover some deleted files.
+
+- Create ``file1`` and create snapshot ``snap1``:
+
+.. code-block:: bash
+
+ $ touch /mnt/cephfs/file1
+ $ cd .snap
+ $ mkdir snap1
+
+- Create ``file2`` and create snapshot ``snap2``:
+
+.. code-block:: bash
+
+ $ touch /mnt/cephfs/file2
+ $ cd .snap
+ $ mkdir snap2
+
+- Delete ``file1`` and create a new snapshot ``snap3``:
+
+.. code-block:: bash
+
+ $ rm /mnt/cephfs/file1
+ $ cd .snap
+ $ mkdir snap3
+
+- Recover ``file1`` from snapshot ``snap2`` using the ``cp`` command:
+
+.. code-block:: bash
+
+ $ cd .snap
+ $ cd snap2
+ $ cp file1 /mnt/cephfs/
+
+Snapshot Deletion
+==================
+
+Snapshots are deleted by invoking ``rmdir`` on the ``.snap`` directory they are
+rooted in. (Attempts to delete a directory which roots the snapshots will fail;
+you must delete the snapshots first.)
+
+.. code-block:: bash
+
+ $ cd .snap
+ $ rmdir my_snapshot
diff --git a/doc/conf.py b/doc/conf.py
index 4fdc9a53b75..5293ff1b212 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -76,7 +76,7 @@ html_show_sphinx = False
html_static_path = ["_static"]
html_sidebars = {
'**': ['smarttoc.html', 'searchbox.html']
- }
+}
html_css_files = ['css/custom.css']
@@ -133,13 +133,23 @@ extensions = [
'sphinxcontrib.mermaid',
'sphinxcontrib.openapi',
'sphinxcontrib.seqdiag',
- ]
+]
ditaa = shutil.which("ditaa")
if ditaa is not None:
# in case we don't have binfmt_misc enabled or jar is not registered
- ditaa_args = ['-jar', ditaa]
- ditaa = 'java'
+ _jar_paths = [
+ '/usr/share/ditaa/lib/ditaa.jar', # Gentoo
+ '/usr/share/ditaa/ditaa.jar', # deb
+ '/usr/share/java/ditaa.jar', # rpm
+ ]
+ _jar_paths = [p for p in _jar_paths if os.path.exists(p)]
+ if _jar_paths:
+ ditaa = 'java'
+ ditaa_args = ['-jar', _jar_paths[0]]
+ else:
+ # keep ditaa from shutil.which
+ ditaa_args = []
extensions += ['sphinxcontrib.ditaa']
else:
extensions += ['plantweb.directive']
diff --git a/doc/dev/crimson/crimson.rst b/doc/dev/crimson/crimson.rst
index f9582ec6c84..f6d59a057ff 100644
--- a/doc/dev/crimson/crimson.rst
+++ b/doc/dev/crimson/crimson.rst
@@ -43,6 +43,82 @@ use a Crimson build:
You'll likely need to supply the ``--allow-mismatched-release`` flag to
use a non-release branch.
+Configure Crimson with Bluestore
+================================
+
+As Bluestore is not a Crimson native `object store backend`_,
+deploying Crimson with Bluestore as the back end requires setting
+one of the two following configuration options:
+
+.. note::
+
+ #. These two options, along with ``crimson_alien_op_num_threads``,
+ can't be changed after deployment.
+ #. `vstart.sh`_ sets these options using the ``--crimson-smp`` flag.
+
+
+1) ``crimson_seastar_num_threads``
+
+ In order to allow easier cluster deployments, this option can be used
+ instead of setting the CPU mask manually for each OSD.
+
+ It's recommended that the **number of OSDs on each host** multiplied by
+ ``crimson_seastar_num_threads`` be less than the node's number of CPU
+ cores (``nproc``).
+
+ For example, for deploying two nodes with eight CPU cores and two OSDs each:
+
+ .. code-block:: yaml
+
+ conf:
+ # Global to all OSDs
+ osd:
+ crimson seastar num threads: 3
+
+ .. note::
+
+ #. For optimal performance ``crimson_seastar_cpu_cores`` should be set instead.
+
+2) ``crimson_seastar_cpu_cores`` and ``crimson_alien_thread_cpu_cores``.
+
+ Explicitly set the CPU core allocation for each ``crimson-osd``
+ and for the BlueStore back end. It's recommended that the two sets be mutually exclusive.
+
+ For example, for deploying two nodes with eight CPU cores and two OSDs each:
+
+ .. code-block:: yaml
+
+ conf:
+ # Both nodes
+ osd:
+ crimson alien thread cpu cores: 6-7
+
+ # First node
+ osd.0:
+ crimson seastar cpu cores: 0-2
+ osd.1:
+ crimson seastar cpu cores: 3-5
+
+ # Second node
+ osd.2:
+ crimson seastar cpu cores: 0-2
+ osd.3:
+ crimson seastar cpu cores: 3-5
+
+ For a single node with eight CPU cores and three OSDs:
+
+ .. code-block:: yaml
+
+ conf:
+ osd:
+ crimson alien thread cpu cores: 6-7
+ osd.0:
+ crimson seastar cpu cores: 0-1
+ osd.1:
+ crimson seastar cpu cores: 2-3
+ osd.2:
+ crimson seastar cpu cores: 4-5
+
Running Crimson
===============
@@ -106,7 +182,7 @@ The following options can be used with ``vstart.sh``.
(as determined by `nproc`) will be assigned to the object store.
``--bluestore``
- Use alienized BlueStore as the object store backend.
+ Use the alienized BlueStore as the object store backend. This is the default (see the section on the `object store backend`_ below for more details).
``--cyanstore``
Use CyanStore as the object store backend.
@@ -115,7 +191,7 @@ The following options can be used with ``vstart.sh``.
Use the alienized MemStore as the object store backend.
``--seastore``
- Use SeaStore as the back end object store. This is the default (see below section on the `object store backend`_ for more details)
+ Use SeaStore as the back end object store.
``--seastore-devs``
Specify the block device used by SeaStore.
@@ -131,11 +207,20 @@ The following options can be used with ``vstart.sh``.
Valid types include ``HDD``, ``SSD`` (default), ``ZNS``, and ``RANDOM_BLOCK_SSD``
Note that secondary devices should not be faster than the main device.
+To start a cluster with a single Crimson node, run::
+
+ $ MGR=1 MON=1 OSD=1 MDS=0 RGW=0 ../src/vstart.sh \
+ --without-dashboard --bluestore --crimson \
+ --redirect-output
-To start a simple cluster with a single core Crimson OSD, run::
+Another example, using SeaStore::
- $ MGR=1 MON=1 OSD=1 MDS=0 RGW=0 ../src/vstart.sh -n \
- --without-dashboard --seastore --crimson
+ $ MGR=1 MON=1 OSD=1 MDS=0 RGW=0 ../src/vstart.sh -n -x \
+ --without-dashboard --seastore \
+ --crimson --redirect-output \
+ --seastore-devs /dev/sda \
+ --seastore-secondary-devs /dev/sdb \
+ --seastore-secondary-devs-type HDD
Stop this ``vstart`` cluster by running::
@@ -154,7 +239,7 @@ They are:
.. describe:: seastore
- Seastore is the default Crimson backend and is still under active development.
+ Seastore is still under active development.
The alienized object store backends are backed by a thread pool, which
is a proxy of the alienstore adaptor running in Seastar. The proxy issues
@@ -169,82 +254,6 @@ managed by the Seastar framework. They are:
The object store used by the classic ``ceph-osd``
-Configure Crimson with Bluestore
-================================
-
-As Bluestore is not a Crimson native `object store backend`_,
-deploying Crimson with Bluestore as the back end requires setting
-one of the two following configuration options:
-
-.. note::
-
- #. These two options, along with ``crimson_alien_op_num_threads``,
- can't be changed after deployment.
- #. `vstart.sh`_ sets these options using the ``--crimson-smp`` flag.
-
-
-1) ``crimson_seastar_num_threads``
-
- In order to allow easier cluster deployments, this option can be used
- instead of setting the CPU mask manually for each OSD.
-
- It's recommended to set the **number of OSDs on each host** multiplied by
- ``crimson_seastar_num_threads`` to be less than the node's number of CPU
- cores (``nproc``).
-
- For example, for deploying two nodes with eight CPU cores and two OSDs each:
-
- .. code-block:: yaml
-
- conf:
- # Global to all OSDs
- osd:
- crimson seastar num threads: 3
-
- .. note::
-
- #. For optimal performance ``crimson_seastar_cpu_cores`` should be set instead.
-
-2) ``crimson_seastar_cpu_cores`` and ``crimson_alien_thread_cpu_cores``.
-
- Explicitly set the CPU core allocation for each ``crimson-osd``
- and for the BlueStore back end. It's recommended for each set to be mutually exclusive.
-
- For example, for deploying two nodes with eight CPU cores and two OSDs each:
-
- .. code-block:: yaml
-
- conf:
- # Both nodes
- osd:
- crimson alien thread cpu cores: 6-7
-
- # First node
- osd.0:
- crimson seastar cpu cores: 0-2
- osd.1:
- crimson seastar cpu cores: 3-5
-
- # Second node
- osd.2:
- crimson seastar cpu cores: 0-2
- osd.3:
- crimson seastar cpu cores: 3-5
-
- For a single node with eight node and three OSDs:
-
- .. code-block:: yaml
-
- conf:
- osd:
- crimson alien thread cpu cores: 6-7
- osd.0:
- crimson seastar cpu cores: 0-1
- osd.1:
- crimson seastar cpu cores: 2-3
- osd.2:
- crimson seastar cpu cores: 4-5
-
daemonize
---------
diff --git a/doc/dev/developer_guide/testing_integration_tests/tests-integration-testing-teuthology-workflow.rst b/doc/dev/developer_guide/testing_integration_tests/tests-integration-testing-teuthology-workflow.rst
index 34dfd521eaa..6964012ef31 100644
--- a/doc/dev/developer_guide/testing_integration_tests/tests-integration-testing-teuthology-workflow.rst
+++ b/doc/dev/developer_guide/testing_integration_tests/tests-integration-testing-teuthology-workflow.rst
@@ -6,7 +6,8 @@ Integration Tests using Teuthology Workflow
Infrastructure
--------------
-Components:
+Components
+**********
1. `ceph-ci`_: Clone of the main Ceph repository, used for triggering Jenkins
Ceph builds for development.
@@ -44,7 +45,27 @@ Components:
Each Teuthology test *run* contains multiple test *jobs*. Each job runs in an
environment isolated from other jobs, on a different collection of test nodes.
-To test a change in Ceph, follow these steps:
+Workflow Overview
+*****************
+
+.. image:: workflow.png
+
+
+To test a change in Ceph, start by pushing a branch with your changes to the
+`ceph-ci`_ repository. This will automatically trigger the Jenkins process
+to build Ceph binaries; the status of the build can be observed on `Shaman`_.
+The built packages will be uploaded to `Chacra`_.
+
+To schedule a Teuthology integration test against this new build, you will
+need access to the Sepia lab. Once you have access, log into the Teuthology
+machine and complete the one-time initial Teuthology setup required to run
+Teuthology commands. After the setup, use the ``teuthology-suite`` command to schedule
+a Teuthology run. In this command, use the ``-c <ceph-ci branch name>`` option to
+specify your build. The results of your test can be observed on `Pulpito`_.
+Log into a `developer playground machine`_ to review the Teuthology run's archive logs.
+
+
+The rest of the document will explain these steps in detail:
1. Getting binaries - Build Ceph.
2. Scheduling Test Run:
@@ -98,6 +119,31 @@ Ceph binaries must be built for your branch before you can use teuthology to run
.. _the Chacra site: https://shaman.ceph.com/api/search/?status=ready&project=ceph
+Pushing to the ceph-ci repository
+*********************************
+
+Follow these steps to push to the ceph-ci repository. After pushing, a new build will
+automatically be scheduled.
+
+1. Add the ceph-ci repository as a remote to your local clone of the Ceph repository:
+
+ .. prompt:: bash $
+
+ git remote add ceph-ci git@github.com:ceph/ceph-ci.git
+
+ $ git remote -v
+ origin git@github.com:ceph/ceph.git (fetch)
+ origin git@github.com:ceph/ceph.git (push)
+ ceph-ci git@github.com:ceph/ceph-ci.git (fetch)
+ ceph-ci git@github.com:ceph/ceph-ci.git (push)
+
+2. Push your branch upstream by running a command of the following form:
+
+ .. prompt:: bash $
+
+ git push ceph-ci wip-yourname-feature-x
+
+
Naming the ceph-ci branch
*************************
Prepend your branch with your name before you push it to ceph-ci. For example,
@@ -110,15 +156,14 @@ the name of that stable branch in your ceph-ci branch name.
For example, the ``feature-x`` PR branch should be named
``wip-feature-x-nautilus``. *This is not just a convention. This ensures that your branch is built in the correct environment.*
-You can choose to only trigger a CentOS 9.Stream build (excluding other distro like ubuntu)
-by adding "centos9-only" at the end of the ceph-ci branch name. For example,
-``wip-$yourname-feature-centos9-only``. This helps to get quicker builds and save resources
-when you don't require binaries for other distros.
-
Delete the branch from ceph-ci when you no longer need it. If you are
logged in to GitHub, all your branches on ceph-ci can be found here:
https://github.com/ceph/ceph-ci/branches.
+.. note:: You can choose to trigger only a CentOS 9.Stream build (excluding other
+ distros such as Ubuntu) by adding "centos9-only" at the end of the ceph-ci branch name.
+ For example, ``wip-$yourname-feature-centos9-only``. This gets you quicker builds
+ and saves resources when you don't require binaries for other distros.
Scheduling Test Run
-------------------
diff --git a/doc/dev/developer_guide/testing_integration_tests/workflow.png b/doc/dev/developer_guide/testing_integration_tests/workflow.png
new file mode 100644
index 00000000000..610baf683bc
--- /dev/null
+++ b/doc/dev/developer_guide/testing_integration_tests/workflow.png
Binary files differ
diff --git a/doc/man/8/cephadm.rst b/doc/man/8/cephadm.rst
index b2cad6cb505..3c23a9867f7 100644
--- a/doc/man/8/cephadm.rst
+++ b/doc/man/8/cephadm.rst
@@ -13,7 +13,7 @@ Synopsis
| [--log-dir LOG_DIR] [--logrotate-dir LOGROTATE_DIR]
| [--unit-dir UNIT_DIR] [--verbose] [--timeout TIMEOUT]
| [--retry RETRY] [--no-container-init]
-| {version,pull,inspect-image,ls,list-networks,adopt,rm-daemon,rm-cluster,run,shell,enter,ceph-volume,unit,logs,bootstrap,deploy,check-host,prepare-host,add-repo,rm-repo,install,list-images}
+| {version,pull,inspect-image,ls,list-networks,adopt,rm-daemon,rm-cluster,run,shell,enter,ceph-volume,unit,logs,bootstrap,deploy,check-host,prepare-host,add-repo,rm-repo,install,list-images,update-osd-service}
| ...
@@ -106,6 +106,7 @@ Synopsis
| **cephadm** **list-images**
+| **cephadm** **update-osd-service** [-h] [--fsid FSID] --osd-ids OSD_IDS --service-name SERVICE_NAME
Description
@@ -535,6 +536,18 @@ list-images
List the default container images for all services in ini format. The output can be modified with custom images and passed to --config flag during bootstrap.
+update-osd-service
+------------------
+
+Update the OSD service for specific OSDs.
+
+Arguments:
+
+* [--fsid FSID] cluster FSID
+* --osd-ids OSD_IDS Comma-separated OSD IDs
+* --service-name SERVICE_NAME OSD service name
+
+
Availability
============
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst
index f5d38948150..a1498a09fd0 100644
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -1665,6 +1665,14 @@ Some of the gateways are in the GW_UNAVAILABLE state. If a NVMeoF daemon has
crashed, the daemon log file (found at ``/var/log/ceph/``) may contain
troubleshooting information.
+NVMEOF_GATEWAY_DELETING
+_______________________
+
+Some of the gateways are in the GW_DELETING state. They will stay in this
+state until all the namespaces under the gateway's load balancing group are
+moved to another load balancing group ID. This is done automatically by the
+load balancing process. If this alert persists for a long time, there might
+be an issue with that process.
Miscellaneous
-------------
diff --git a/doc/rados/operations/stretch-mode.rst b/doc/rados/operations/stretch-mode.rst
index e8be5e13e6a..7a4fa46117d 100644
--- a/doc/rados/operations/stretch-mode.rst
+++ b/doc/rados/operations/stretch-mode.rst
@@ -119,13 +119,29 @@ See https://tracker.ceph.com/issues/68338 for more information.
Stretch Mode
============
-Stretch mode is designed to handle deployments in which you cannot guarantee the
-replication of data across two data centers. This kind of situation can arise
-when the cluster's CRUSH rule specifies that three copies are to be made, but
-then a copy is placed in each data center with a ``min_size`` of 2. Under such
-conditions, a placement group can become active with two copies in the first
-data center and no copies in the second data center.
+Stretch mode is designed to handle netsplit scenarios between two data zones as well
+as the loss of one data zone. It handles the netsplit scenario by choosing the surviving zone
+that has the better connection to the ``tiebreaker monitor``. It handles the loss of one zone by
+reducing the ``size`` to ``2`` and ``min_size`` to ``1``, allowing the cluster to continue operating
+with the remaining zone. When the lost zone comes back, the cluster will recover the lost data
+and return to normal operation.
+
+Connectivity Monitor Election Strategy
+---------------------------------------
+When using stretch mode, the monitor election strategy must be set to ``connectivity``.
+This strategy tracks network connectivity between the monitors and is
+used to determine which zone should be favored when the cluster is in a netsplit scenario.
+
+See `Changing Monitor Elections`_.
+
+Stretch Peering Rule
+--------------------
+One critical behavior of stretch mode is its ability to prevent a PG from going active if the acting set
+contains only replicas from a single zone. This safeguard is crucial for mitigating the risk of data
+loss during site failures: if a PG were allowed to go active with replicas only in a single site,
+writes could be acknowledged despite a lack of redundancy. In the event of a site failure, all data in the
+affected PG would be lost.
Entering Stretch Mode
---------------------
@@ -271,7 +287,7 @@ possible, if needed).
.. _Changing Monitor elections: ../change-mon-elections
Exiting Stretch Mode
-=====================
+--------------------
To exit stretch mode, run the following command:
.. prompt:: bash $
diff --git a/doc/radosgw/account.rst b/doc/radosgw/account.rst
index 6dab997d93e..0e4ede5a50a 100644
--- a/doc/radosgw/account.rst
+++ b/doc/radosgw/account.rst
@@ -174,6 +174,11 @@ An existing user can be adopted into an account with ``user modify``::
.. note:: Account membership is permanent. Once added, users cannot be
removed from their account.
+.. note:: The IAM User API imposes additional requirements on the format
+ of ``UserName``, which is enforced when migrating users into an account.
+ If migration fails with "UserName contains invalid characters", the
+ ``--display-name`` should be modified to match ``[\w+=,.@-]+``.
+
.. warning:: Ownership of the user's notification topics will not be
transferred to the account. Notifications will continue to work, but
the topics will no longer be visible to SNS Topic APIs. Topics and
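As a quick pre-check for the migration described above, the ``UserName``
pattern quoted in the note can be tested directly. A small sketch (the helper
name and sample values are illustrative only):

.. code-block:: python

    import re

    # Character class quoted in the note: word characters plus + = , . @ -
    IAM_USERNAME = re.compile(r'[\w+=,.@-]+')

    def display_name_ok(display_name: str) -> bool:
        return IAM_USERNAME.fullmatch(display_name) is not None

    print(display_name_ok("alice@example"))  # True
    print(display_name_ok("alice smith"))    # False: space is not allowed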
diff --git a/doc/radosgw/config-ref.rst b/doc/radosgw/config-ref.rst
index edc6a90b0f9..b4aa56fff54 100644
--- a/doc/radosgw/config-ref.rst
+++ b/doc/radosgw/config-ref.rst
@@ -78,7 +78,7 @@ These values can be tuned based upon your specific workload to further increase
aggressiveness of lifecycle processing. For a workload with a larger number of buckets (thousands)
you would look at increasing the :confval:`rgw_lc_max_worker` value from the default value of 3 whereas for a
workload with a smaller number of buckets but higher number of objects (hundreds of thousands)
-per bucket you would consider decreasing :confval:`rgw_lc_max_wp_worker` from the default value of 3.
+per bucket you would consider increasing :confval:`rgw_lc_max_wp_worker` from the default value of 3.
.. note:: When looking to tune either of these specific values please validate the
current Cluster performance and Ceph Object Gateway utilization before increasing.
diff --git a/doc/radosgw/s3/objectops.rst b/doc/radosgw/s3/objectops.rst
index 2ac52607fe3..ddc5fb910c4 100644
--- a/doc/radosgw/s3/objectops.rst
+++ b/doc/radosgw/s3/objectops.rst
@@ -115,7 +115,7 @@ Request Headers
+---------------------------+------------------------------------------------+--------------------------------+------------+
| **if-match** | Gets only if object ETag matches ETag. | Entity Tag | No |
+---------------------------+------------------------------------------------+--------------------------------+------------+
-| **if-none-match** | Gets only if object ETag matches ETag. | Entity Tag | No |
+| **if-none-match** | Gets only if object ETag doesn't match. | Entity Tag | No |
+---------------------------+------------------------------------------------+--------------------------------+------------+
Response Headers
@@ -155,7 +155,7 @@ Request Headers
+---------------------------+------------------------------------------------+--------------------------------+------------+
| **if-match** | Gets only if object ETag matches ETag. | Entity Tag | No |
+---------------------------+------------------------------------------------+--------------------------------+------------+
-| **if-none-match** | Gets only if object ETag matches ETag. | Entity Tag | No |
+| **if-none-match** | Gets only if object ETag doesn't match. | Entity Tag | No |
+---------------------------+------------------------------------------------+--------------------------------+------------+
Get Object ACL
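For reference, the corrected ``if-none-match`` semantics can be exercised with
plain HTTP. A minimal sketch against an RGW endpoint (the URL is a placeholder
and the object is assumed to be publicly readable):

.. code-block:: python

    import requests

    url = "http://rgw.example.com:8000/mybucket/myobject"  # placeholder

    etag = requests.head(url).headers["ETag"]

    # ETag matches, so the object is NOT returned: expect 304 Not Modified.
    r = requests.get(url, headers={"If-None-Match": etag})
    print(r.status_code)  # 304

    # ETag doesn't match, so the object body is returned normally.
    r = requests.get(url, headers={"If-None-Match": '"bogus"'})
    print(r.status_code)  # 200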
diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet
index a15b88422fc..e917b4c2dac 100644
--- a/monitoring/ceph-mixin/config.libsonnet
+++ b/monitoring/ceph-mixin/config.libsonnet
@@ -9,12 +9,12 @@
CephNodeNetworkPacketDropsPerSec: 10,
CephRBDMirrorImageTransferBandwidthThreshold: 0.8,
CephRBDMirrorImagesPerDaemonThreshold: 100,
- NVMeoFMaxGatewaysPerGroup: 4,
- NVMeoFMaxGatewaysPerCluster: 4,
+ NVMeoFMaxGatewaysPerGroup: 8,
+ NVMeoFMaxGatewaysPerCluster: 32,
NVMeoFHighGatewayCPU: 80,
NVMeoFMaxSubsystemsPerGateway: 128,
- NVMeoFMaxNamespaces: 1024,
- NVMeoFHighClientCount: 32,
+ NVMeoFMaxNamespaces: 2048,
+ NVMeoFHighClientCount: 128,
NVMeoFHighHostCPU: 80,
//
// Read/Write latency is defined in ms
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index 3440d761351..7c0da4d51a4 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -776,18 +776,18 @@ groups:
type: "ceph_default"
- alert: "NVMeoFTooManyGateways"
annotations:
- description: "You may create many gateways, but 4 is the tested limit"
+ description: "You may create many gateways, but 32 is the tested limit"
summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}"
- expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00"
+ expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 32.00"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFMaxGatewayGroupSize"
annotations:
- description: "You may create many gateways in a gateway group, but 4 is the tested limit"
+ description: "You may create many gateways in a gateway group, but 8 is the tested limit"
summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}"
- expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00"
+ expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00"
for: "1m"
labels:
severity: "warning"
@@ -832,7 +832,7 @@ groups:
annotations:
description: "Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported"
summary: "The number of namespaces defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}"
- expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 1024.00"
+ expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 2048.00"
for: "1m"
labels:
severity: "warning"
@@ -848,9 +848,9 @@ groups:
type: "ceph_default"
- alert: "NVMeoFHighClientCount"
annotations:
- description: "The supported limit for clients connecting to a subsystem is 32"
+ description: "The supported limit for clients connecting to a subsystem is 128"
summary: "The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}"
- expr: "ceph_nvmeof_subsystem_host_count > 32.00"
+ expr: "ceph_nvmeof_subsystem_host_count > 128.00"
for: "1m"
labels:
severity: "warning"
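The ``label_replace`` calls above derive ``gateway_host`` from the ``instance``
label by stripping an optional ``:port`` suffix with the regex
``(.*?)(?::.*)?``. Since ``label_replace`` anchors the regex to the whole label
value, the behavior can be checked in Python with ``re.fullmatch`` (a sketch;
the sample instances are made up):

.. code-block:: python

    import re

    pattern = re.compile(r'(.*?)(?::.*)?')  # regex from the alert expression

    for instance in ("node-1:10008", "node-1"):
        host = pattern.fullmatch(instance).group(1)
        print(instance, "->", host)  # both print gateway_host "node-1"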
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index b3b29308d08..83b4ff80375 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -2331,12 +2331,69 @@ tests:
values: '1+0x20'
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5",cluster="mycluster"}'
values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.6",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.7",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.8",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.9",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.10",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.11",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.12",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.13",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.14",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.15",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.16",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.17",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.18",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.19",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.20",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.21",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.22",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.23",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.24",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.25",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.26",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.27",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.28",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.29",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.30",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.31",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.32",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.33",cluster="mycluster"}'
+ values: '1+0x20'
+
promql_expr_test:
- - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00
+ - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 32.00
eval_time: 1m
exp_samples:
- labels: '{cluster="mycluster"}'
- value: 5
+ value: 33
alert_rule_test:
- eval_time: 5m
alertname: NVMeoFTooManyGateways
@@ -2347,7 +2404,7 @@ tests:
type: ceph_default
exp_annotations:
summary: "Max supported gateways exceeded on cluster mycluster"
- description: "You may create many gateways, but 4 is the tested limit"
+ description: "You may create many gateways, but 32 is the tested limit"
# NVMeoFMaxGatewayGroupSize
- interval: 1m
@@ -2362,16 +2419,24 @@ tests:
values: '1+0x20'
- series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.12",cluster="mycluster"}'
values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.10",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.14",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.11",cluster="mycluster"}'
+ values: '1+0x20'
+ - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.13",cluster="mycluster"}'
+ values: '1+0x20'
- series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}'
values: '1+0x20'
- series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}'
values: '1+0x20'
promql_expr_test:
- - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 4.00
+ - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 8.00
eval_time: 1m
exp_samples:
- labels: '{cluster="mycluster",group="group-1"}'
- value: 5
+ value: 9
alert_rule_test:
- eval_time: 5m
alertname: NVMeoFMaxGatewayGroupSize
@@ -2383,7 +2448,7 @@ tests:
type: ceph_default
exp_annotations:
summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster"
- description: "You may create many gateways in a gateway group, but 4 is the tested limit"
+ description: "You may create many gateways in a gateway group, but 8 is the tested limit"
# NVMeoFSingleGatewayGroup
- interval: 1m
@@ -2767,12 +2832,14 @@ tests:
values: '200+0x10'
- series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn10",cluster="mycluster"}'
values: '200+0x10'
+ - series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn11",cluster="mycluster"}'
+ values: '200+0x10'
promql_expr_test:
- - expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 1024
+ - expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 2048
eval_time: 1m
exp_samples:
- labels: '{gateway_host="node-1", cluster="mycluster"}'
- value: 2000
+ value: 2200
alert_rule_test:
- eval_time: 5m
alertname: NVMeoFTooManyNamespaces
@@ -2815,15 +2882,15 @@ tests:
- interval: 1m
input_series:
- series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1",cluster="mycluster"}'
- values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44'
+ values: '2 4 8 10 20 30 40 50 62 74 80 95 100 110 130 130 130 130 130 130'
- series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2",cluster="mycluster"}'
- values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16'
+ values: '2 8 16 16 16 16 16 16 16 16 20 20 32 34 34 36 37 37 37 37'
promql_expr_test:
- - expr: ceph_nvmeof_subsystem_host_count > 32.00
+ - expr: ceph_nvmeof_subsystem_host_count > 128.00
eval_time: 15m
exp_samples:
- labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1",cluster="mycluster"}'
- value: 38
+ value: 130
alert_rule_test:
- eval_time: 20m
alertname: NVMeoFHighClientCount
@@ -2835,7 +2902,7 @@ tests:
type: ceph_default
exp_annotations:
summary: "The number of clients connected to nqn1 is too high on cluster mycluster"
- description: "The supported limit for clients connecting to a subsystem is 32"
+ description: "The supported limit for clients connecting to a subsystem is 128"
# NVMeoFMissingListener
- interval: 1m
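The ``values`` strings in the unit tests above use promtool's expanding series
notation: ``start+incrementxrepeat`` emits the start value followed by
``repeat`` further samples, each bumped by ``increment``. A simplified decoder
for the subset used here (promtool also supports ``-`` and ``_``, which this
sketch ignores):

.. code-block:: python

    def expand(values: str) -> list[float]:
        """Expand promtool notation like '1+0x20' or '200+0x10'."""
        out: list[float] = []
        for tok in values.split():
            if '+' in tok and 'x' in tok:
                start, rest = tok.split('+')
                inc, reps = rest.split('x')
                out.extend(float(start) + float(inc) * i
                           for i in range(int(reps) + 1))
            else:
                out.append(float(tok))
        return out

    print(len(expand('1+0x20')))  # 21 samples, all equal to 1.0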
diff --git a/qa/suites/crimson-rados-experimental/.qa b/qa/suites/crimson-rados-experimental/.qa
index fea2489fdf6..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/.qa
+++ b/qa/suites/crimson-rados-experimental/.qa
@@ -1 +1 @@
-../.qa \ No newline at end of file
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml
deleted file mode 120000
index bd9854e7029..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/distros/supported/centos_latest.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml
deleted file mode 100644
index d8e5898b99f..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-overrides:
- ceph-deploy:
- conf:
- global:
- osd pool default size: 2
- osd crush chooseleaf type: 0
- osd pool default pg num: 128
- osd pool default pgp num: 128
- ceph:
- conf:
- osd:
- osd shutdown pgref assert: true
-roles:
-- [mon.a, mgr.x, osd.0, osd.1, osd.2, client.0]
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml
deleted file mode 100644
index c22f08eecf8..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-overrides:
- install:
- ceph:
- flavor: crimson
-tasks:
-- install:
-- ceph:
- conf:
- osd:
- debug monc: 20
- mon:
- mon min osdmap epochs: 50
- paxos service trim min: 10
- # prune full osdmaps regularly
- mon osdmap full prune min: 15
- mon osdmap full prune interval: 2
- mon osdmap full prune txsize: 2
- flavor: crimson
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml
deleted file mode 100644
index ad8c921425b..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-overrides:
- ceph:
- log-ignorelist:
- - reached quota
- - but it is still running
- - overall HEALTH_
- - \(POOL_FULL\)
- - \(SMALLER_PGP_NUM\)
- - \(CACHE_POOL_NO_HIT_SET\)
- - \(CACHE_POOL_NEAR_FULL\)
- - \(POOL_APP_NOT_ENABLED\)
- - \(PG_AVAILABILITY\)
- - \(PG_DEGRADED\)
- conf:
- client:
- debug ms: 1
- mon:
- mon warn on pool no app: false
- osd:
- osd class load list: "*"
- osd class default list: "*"
- osd blocked scrub grace period: 3600
-tasks:
-- workunit:
- clients:
- client.0:
- - rados/test.sh
- - rados/test_pool_quota.sh
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml
deleted file mode 100644
index 25efcdac83d..00000000000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-overrides:
- ceph:
- crush_tunables: optimal
- conf:
- mon:
- mon osd initial require min compat client: luminous
- osd:
- osd_discard_disconnected_ops: false
-tasks:
-- rados:
- clients: [client.0]
- ops: 4000
- objects: 500
- max_attr_len: 8192
- op_weights:
- read: 45
- write: 45
- delete: 10
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/% b/qa/suites/crimson-rados-experimental/thrash/%
index e69de29bb2d..e69de29bb2d 100644
--- a/qa/suites/crimson-rados-experimental/seastore/basic/%
+++ b/qa/suites/crimson-rados-experimental/thrash/%
diff --git a/qa/suites/crimson-rados-experimental/seastore/.qa b/qa/suites/crimson-rados-experimental/thrash/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/.qa
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/.qa b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa
diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled
new file mode 120000
index 00000000000..5393a75548a
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled
@@ -0,0 +1 @@
+.qa/overrides/2-size-2-min-size.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml
new file mode 120000
index 00000000000..5ff70eadf75
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml
@@ -0,0 +1 @@
+.qa/overrides/3-size-2-min-size.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa
diff --git a/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml
diff --git a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml
index abd86d7d986..abd86d7d986 120000
--- a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled
+++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled
new file mode 120000
index 00000000000..47afd70202d
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled
@@ -0,0 +1 @@
+.qa/overrides/more-active-recovery.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled
new file mode 100644
index 00000000000..0bbc72db754
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled
@@ -0,0 +1,6 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_async_recovery_min_cost: 1
+ osd_object_clean_region_max_num_intervals: 1000
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled
new file mode 100644
index 00000000000..4aed086bcc3
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_async_recovery_min_cost: 1
diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled
new file mode 100644
index 00000000000..88f15f2f691
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ osd_object_clean_region_max_num_intervals: 1000
diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/+ b/qa/suites/crimson-rados-experimental/thrash/clusters/+
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/clusters/+
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml
index 9774de6887b..79641f695ab 100644
--- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml
+++ b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml
@@ -6,6 +6,15 @@ overrides:
conf:
osd:
osd shutdown pgref assert: true
+ crimson alien thread cpu cores: 6-7
+ osd.0:
+ crimson seastar cpu cores: 0-2
+ osd.1:
+ crimson seastar cpu cores: 3-5
+ osd.2:
+ crimson seastar cpu cores: 0-2
+ osd.3:
+ crimson seastar cpu cores: 3-5
global:
ms cluster mode: crc
ms service mode: crc
diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled
new file mode 100644
index 00000000000..e559d9126e8
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled
@@ -0,0 +1,4 @@
+openstack:
+ - volumes: # attached to each instance
+ count: 4
+ size: 10 # GB
diff --git a/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro
new file mode 120000
index 00000000000..a5b729b9efa
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro
@@ -0,0 +1 @@
+.qa/distros/crimson-supported-all-distro/ \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml
index 2bf67af1b18..2bf67af1b18 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml
+++ b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa
index a602a0353e7..a602a0353e7 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa
+++ b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa
diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml
new file mode 100644
index 00000000000..ecad09cfe3a
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml
@@ -0,0 +1,11 @@
+overrides:
+ install:
+ ceph:
+ flavor: crimson
+tasks:
+- install:
+- ceph:
+ conf:
+ osd:
+ debug monc: 20
+ flavor: crimson
diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled
new file mode 100644
index 00000000000..0c2062240ee
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled
@@ -0,0 +1,16 @@
+# no need to verify os + flavor + sha1
+verify_ceph_hash: false
+tasks:
+- cephadm:
+ conf:
+ mgr:
+ debug ms: 1
+ debug mgr: 20
+ debug osd: 10
+- cephadm.shell:
+ mon.a:
+ - ceph orch status
+ - ceph orch ps
+ - ceph orch ls
+ - ceph orch host ls
+ - ceph orch device ls
diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml
index 61e26e7acf8..61e26e7acf8 120000
--- a/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml
+++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml
diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml
new file mode 100644
index 00000000000..aa44b6101ff
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml
@@ -0,0 +1,34 @@
+overrides:
+ ceph:
+ log-ignorelist:
+ - but it is still running
+ - objects unfound and apparently lost
+ conf:
+ osd:
+ osd debug reject backfill probability: .3
+ osd scrub min interval: 60
+ osd scrub max interval: 120
+ osd max backfills: 3
+ osd snap trim sleep: 2
+ osd delete sleep: 1
+ mon:
+ mon min osdmap epochs: 50
+ paxos service trim min: 10
+ # prune full osdmaps regularly
+ mon osdmap full prune min: 15
+ mon osdmap full prune interval: 2
+ mon osdmap full prune txsize: 2
+tasks:
+- thrashosds:
+ timeout: 2400
+ dump_ops_enable: false
+ sighup_delay: 0
+ min_in: 3
+ noscrub_toggle_delay: 0
+ chance_thrash_pg_upmap: 0
+ reweight_osd: 0
+ thrash_primary_affinity: false
+ ceph_objectstore_tool: false
+ chance_inject_pause_short: 0
+ chance_thrash_cluster_full: 0
+ chance_reset_purged_snaps_last: 0
diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml
new file mode 120000
index 00000000000..9124eb1aa29
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml
@@ -0,0 +1 @@
+.qa/tasks/thrashosds-health.yaml \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/.qa b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa
new file mode 120000
index 00000000000..a602a0353e7
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa
@@ -0,0 +1 @@
+../.qa/ \ No newline at end of file
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml
new file mode 100644
index 00000000000..8c9764ade84
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml
@@ -0,0 +1,13 @@
+overrides:
+ ceph:
+ conf:
+ client.0:
+ admin socket: /var/run/ceph/ceph-$name.asok
+tasks:
+- radosbench:
+ clients: [client.0]
+ time: 150
+- admin_socket:
+ client.0:
+ objecter_requests:
+ test: "http://git.ceph.com/?p={repo};a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}"
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml
new file mode 100644
index 00000000000..d35e8421ab4
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml
@@ -0,0 +1,20 @@
+overrides:
+ conf:
+ osd:
+ osd deep scrub update digest min age: 0
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ pool_snaps: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
+
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml
new file mode 100644
index 00000000000..902c4b56a1e
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml
@@ -0,0 +1,49 @@
+overrides:
+ ceph:
+ conf:
+ client.0:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ concurrency: 128
+ size: 8192
+ time: 90
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml
new file mode 100644
index 00000000000..071f55e3928
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml
@@ -0,0 +1,24 @@
+overrides:
+ ceph:
+ conf:
+ client.0:
+ debug ms: 1
+ debug objecter: 20
+ debug rados: 20
+tasks:
+- full_sequential:
+ - radosbench:
+ clients: [client.0]
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ time: 90
+ - radosbench:
+ clients: [client.0]
+ time: 90
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml
new file mode 100644
index 00000000000..afe04229898
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml
@@ -0,0 +1,24 @@
+overrides:
+ ceph:
+ crush_tunables: jewel
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ balance_reads: true
+ max_attr_len: 8192
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
+ setattr: 25
+ rmattr: 25
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml
new file mode 100644
index 00000000000..445b582ea42
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml
@@ -0,0 +1,24 @@
+overrides:
+ ceph:
+ crush_tunables: jewel
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ localize_reads: true
+ max_attr_len: 8192
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
+ setattr: 25
+ rmattr: 25
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml
new file mode 100644
index 00000000000..e7e8070fd76
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml
@@ -0,0 +1,23 @@
+overrides:
+ ceph:
+ crush_tunables: jewel
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 400000
+ max_seconds: 600
+ max_in_flight: 64
+ objects: 1024
+ size: 16384
+ max_attr_len: 8192
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
+ setattr: 25
+ rmattr: 25
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml
new file mode 100644
index 00000000000..1161c3cc253
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml
@@ -0,0 +1,15 @@
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ balance_reads: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml
new file mode 100644
index 00000000000..80af0def0e4
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml
@@ -0,0 +1,15 @@
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ localize_reads: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml
new file mode 100644
index 00000000000..0694ffcd0d6
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml
@@ -0,0 +1,14 @@
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 0
+ # TODO: CEPH_OSD_OP_COPY_FROM
+ copy_from: 0
diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml
new file mode 100644
index 00000000000..606dcae6922
--- /dev/null
+++ b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml
@@ -0,0 +1,8 @@
+tasks:
+- rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 500
+ write_fadvise_dontneed: true
+ op_weights:
+ write: 100
diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml
index abd86d7d986..abd86d7d986 120000
--- a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled
+++ b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml
diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/crimson_seastore.yaml b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled
index 61e26e7acf8..61e26e7acf8 120000
--- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/crimson_seastore.yaml
+++ b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled
diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml
new file mode 120000
index 00000000000..abd86d7d986
--- /dev/null
+++ b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml
@@ -0,0 +1 @@
+.qa/overrides/short_pg_log.yaml \ No newline at end of file
diff --git a/qa/suites/rados/verify/validater/valgrind.yaml b/qa/suites/rados/verify/validater/valgrind.yaml
index e2dc29b5f7e..17cf141b0cd 100644
--- a/qa/suites/rados/verify/validater/valgrind.yaml
+++ b/qa/suites/rados/verify/validater/valgrind.yaml
@@ -27,6 +27,7 @@ overrides:
- \(SLOW_OPS\)
- slow request
- OSD bench result
+ - OSD_DOWN
valgrind:
mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
osd: [--tool=memcheck]
diff --git a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml
index 40fbcefe728..62fb6427f72 100644
--- a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml
+++ b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml
@@ -32,13 +32,22 @@ overrides:
osd:
osd shutdown pgref assert: true
log-ignorelist:
- - \(POOL_APP_NOT_ENABLED\)
+ - do not have an application enabled
+ - application not enabled
+ - or freeform for custom applications
+ - POOL_APP_NOT_ENABLED
+ - is down
- OSD_DOWN
- mons down
- mon down
- MON_DOWN
- out of quorum
+ - PG_AVAILABILITY
- PG_DEGRADED
- Reduced data availability
- Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
+ - FS_DEGRADED
- OSDMAP_FLAGS
+ - OSD_UPGRADE_FINISHED
diff --git a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml
index e27c7c0f092..f7167975aa9 100644
--- a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml
+++ b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml
@@ -1,11 +1,8 @@
overrides:
ceph:
log-ignorelist:
- - mons down
- - mon down
- - MON_DOWN
- - out of quorum
- - PG_AVAILABILITY
+ - Telemetry requires re-opt-in
+ - telemetry module includes new collections
tasks:
- install:
branch: quincy
diff --git a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml
index 005514292ce..5641471629e 100644
--- a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml
+++ b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml
@@ -1,17 +1,25 @@
overrides:
ceph:
log-ignorelist:
- - \(POOL_APP_NOT_ENABLED\)
+ - do not have an application enabled
+ - application not enabled
+ - or freeform for custom applications
+ - POOL_APP_NOT_ENABLED
+ - is down
- OSD_DOWN
- mons down
- mon down
- MON_DOWN
- out of quorum
+ - PG_AVAILABILITY
- PG_DEGRADED
- Reduced data availability
- Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
+ - FS_DEGRADED
- OSDMAP_FLAGS
- - PG_AVAILABILITY
+ - OSD_UPGRADE_FINISHED
tasks:
- install:
branch: quincy
diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml
index 146bd57960d..62fb6427f72 100644
--- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml
@@ -32,4 +32,22 @@ overrides:
osd:
osd shutdown pgref assert: true
log-ignorelist:
- - PG_DEGRADED
+ - do not have an application enabled
+ - application not enabled
+ - or freeform for custom applications
+ - POOL_APP_NOT_ENABLED
+ - is down
+ - OSD_DOWN
+ - mons down
+ - mon down
+ - MON_DOWN
+ - out of quorum
+ - PG_AVAILABILITY
+ - PG_DEGRADED
+ - Reduced data availability
+ - Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
+ - FS_DEGRADED
+ - OSDMAP_FLAGS
+ - OSD_UPGRADE_FINISHED
diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
index ce4e0cc228b..b5160c2dd00 100644
--- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml
@@ -1,12 +1,8 @@
overrides:
ceph:
log-ignorelist:
- - mons down
- - mon down
- - MON_DOWN
- - out of quorum
- - PG_AVAILABILITY
- - PG_DEGRADED
+ - Telemetry requires re-opt-in
+ - telemetry module includes new collections
tasks:
- install:
branch: reef
diff --git a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml
index 5e995da7d2c..fa93b2f2ece 100644
--- a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml
+++ b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml
@@ -1,20 +1,19 @@
overrides:
ceph:
log-ignorelist:
- - \(MDS_ALL_DOWN\)
- - \(MDS_UP_LESS_THAN_MAX\)
- - \(OSD_SLOW_PING_TIME
+ - MDS_ALL_DOWN
+ - MDS_UP_LESS_THAN_MAX
+ - OSD_SLOW_PING_TIME
- reached quota
+ - running out of quota
- overall HEALTH_
- - \(CACHE_POOL_NO_HIT_SET\)
- - \(POOL_FULL\)
- - \(SMALLER_PGP_NUM\)
- - \(SLOW_OPS\)
- - \(CACHE_POOL_NEAR_FULL\)
- - \(POOL_APP_NOT_ENABLED\)
- - \(PG_AVAILABILITY\)
- - \(OBJECT_MISPLACED\)
+ - CACHE_POOL_NO_HIT_SET
+ - pool\(s\) full
+ - POOL_FULL
+ - SMALLER_PGP_NUM
+ - SLOW_OPS
+ - CACHE_POOL_NEAR_FULL
+ - OBJECT_MISPLACED
- slow request
- - \(MON_DOWN\)
- noscrub
- nodeep-scrub
diff --git a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml
index 992f9e1bc36..59ccfe2cd02 100644
--- a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml
+++ b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml
@@ -1,11 +1,25 @@
overrides:
ceph:
log-ignorelist:
+ - do not have an application enabled
+ - application not enabled
+ - or freeform for custom applications
+ - POOL_APP_NOT_ENABLED
+ - is down
+ - OSD_DOWN
- mons down
- mon down
- MON_DOWN
- out of quorum
- PG_AVAILABILITY
+ - PG_DEGRADED
+ - Reduced data availability
+ - Degraded data redundancy
+ - pg .* is stuck inactive
+ - pg .* is .*degraded
+ - FS_DEGRADED
+ - OSDMAP_FLAGS
+ - OSD_UPGRADE_FINISHED
tasks:
- install:
branch: reef
diff --git a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml
index 5e995da7d2c..fa93b2f2ece 100644
--- a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml
+++ b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml
@@ -1,20 +1,19 @@
overrides:
ceph:
log-ignorelist:
- - \(MDS_ALL_DOWN\)
- - \(MDS_UP_LESS_THAN_MAX\)
- - \(OSD_SLOW_PING_TIME
+ - MDS_ALL_DOWN
+ - MDS_UP_LESS_THAN_MAX
+ - OSD_SLOW_PING_TIME
- reached quota
+ - running out of quota
- overall HEALTH_
- - \(CACHE_POOL_NO_HIT_SET\)
- - \(POOL_FULL\)
- - \(SMALLER_PGP_NUM\)
- - \(SLOW_OPS\)
- - \(CACHE_POOL_NEAR_FULL\)
- - \(POOL_APP_NOT_ENABLED\)
- - \(PG_AVAILABILITY\)
- - \(OBJECT_MISPLACED\)
+ - CACHE_POOL_NO_HIT_SET
+ - pool\(s\) full
+ - POOL_FULL
+ - SMALLER_PGP_NUM
+ - SLOW_OPS
+ - CACHE_POOL_NEAR_FULL
+ - OBJECT_MISPLACED
- slow request
- - \(MON_DOWN\)
- noscrub
- nodeep-scrub
diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py
index 346f139874b..468378fce3d 100644
--- a/qa/tasks/cephfs/test_exports.py
+++ b/qa/tasks/cephfs/test_exports.py
@@ -153,6 +153,8 @@ class TestExportPin(CephFSTestCase):
# vstart.sh sets mds_debug_subtrees to True. That causes a ESubtreeMap
# to be written out every event. Yuck!
self.config_set('mds', 'mds_debug_subtrees', False)
+ # make sure ESubtreeMap is written frequently enough:
+ self.config_set('mds', 'mds_log_minor_segments_per_major_segment', '4')
self.config_rm('mds', 'mds bal split size') # don't split /top
self.mount_a.run_shell_payload("rm -rf 1")
diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py
index c58a7267b4e..691a6f7dd86 100644
--- a/qa/tasks/nvmeof.py
+++ b/qa/tasks/nvmeof.py
@@ -315,7 +315,7 @@ class NvmeofThrasher(Thrasher, Greenlet):
def _get_devices(self, remote):
GET_DEVICE_CMD = "sudo nvme list --output-format=json | " \
- "jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == \"Ceph bdev Controller\") | .DevicePath'"
+ "jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"Ceph bdev Controller\")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace'"
devices = remote.sh(GET_DEVICE_CMD).split()
return devices
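The reworked jq filter above assumes the newer `nvme list --output-format=json` layout, where namespaces are nested under Devices[].Subsystems[] rather than listed flat. A rough Python equivalent of the same selection (the JSON structure is assumed, for illustration only):

    import json

    def ceph_namespace_devices(nvme_list_json: str) -> list:
        """Rough Python equivalent of the jq filter above (JSON layout assumed)."""
        data = json.loads(nvme_list_json)
        result = []
        for dev in data.get('Devices', []):
            for subsys in dev.get('Subsystems', []):
                ctrls = subsys.get('Controllers', [])
                # jq's all(...) is true for an empty list, as is Python's all().
                if all(c.get('ModelNumber') == 'Ceph bdev Controller' for c in ctrls):
                    for ns in sorted(subsys.get('Namespaces', []), key=lambda n: n['NSID']):
                        result.append(ns['NameSpace'])
        return result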
diff --git a/qa/tasks/rgw_multisite.py b/qa/tasks/rgw_multisite.py
index e83a54efc2b..f93ca017fa2 100644
--- a/qa/tasks/rgw_multisite.py
+++ b/qa/tasks/rgw_multisite.py
@@ -361,6 +361,8 @@ def create_zonegroup(cluster, gateways, period, config):
if endpoints:
# replace client names with their gateway endpoints
config['endpoints'] = extract_gateway_endpoints(gateways, endpoints)
+ if not config.get('api_name'): # otherwise it will be set to an empty string
+ config['api_name'] = config['name']
zonegroup = multisite.ZoneGroup(config['name'], period)
# `zonegroup set` needs --default on command line, and 'is_master' in json
args = is_default_arg(config)
diff --git a/qa/workunits/nvmeof/basic_tests.sh b/qa/workunits/nvmeof/basic_tests.sh
index 794353348b4..9e7a1f5134e 100755
--- a/qa/workunits/nvmeof/basic_tests.sh
+++ b/qa/workunits/nvmeof/basic_tests.sh
@@ -39,7 +39,7 @@ connect_all() {
sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600
sleep 5
expected_devices_count=$1
- actual_devices=$(sudo nvme list --output-format=json | grep -o "$SPDK_CONTROLLER" | wc -l)
+ actual_devices=$(sudo nvme list --output-format=json | jq -r ".Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"$SPDK_CONTROLLER\")) | .Namespaces[].NameSpace" | wc -l)
if [ "$actual_devices" -ne "$expected_devices_count" ]; then
sudo nvme list --output-format=json
return 1
@@ -74,7 +74,7 @@ test_run connect
test_run list_subsys 1
test_run disconnect_all
test_run list_subsys 0
-devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT))
+devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT ))
test_run connect_all $devices_count
gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT))
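The count arithmetic above multiplies namespaces by subsystems for the expected device count, and gateways by subsystems for the expected multipath count. With hypothetical values for the environment variables, it works out as follows (expressed in Python for brevity):

    # Hypothetical values for the environment variables used above.
    NVMEOF_NAMESPACES_COUNT = 4
    NVMEOF_SUBSYSTEMS_COUNT = 3
    NVMEOF_GATEWAY_IP_ADDRESSES = '10.0.0.1,10.0.0.2'

    devices_count = NVMEOF_NAMESPACES_COUNT * NVMEOF_SUBSYSTEMS_COUNT       # 12
    gateways_count = NVMEOF_GATEWAY_IP_ADDRESSES.count(',') + 1             # 2
    multipath_count = gateways_count * NVMEOF_SUBSYSTEMS_COUNT              # 6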
diff --git a/qa/workunits/nvmeof/fio_test.sh b/qa/workunits/nvmeof/fio_test.sh
index 03fb58693bd..f7f783afc67 100755
--- a/qa/workunits/nvmeof/fio_test.sh
+++ b/qa/workunits/nvmeof/fio_test.sh
@@ -34,7 +34,7 @@ done
fio_file=$(mktemp -t nvmeof-fio-XXXX)
all_drives_list=$(sudo nvme list --output-format=json |
- jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == "Ceph bdev Controller") | .DevicePath')
+ jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == "Ceph bdev Controller")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace')
# When the script is passed --start_ns and --end_ns (example: `nvmeof_fio_test.sh --start_ns 1 --end_ns 3`),
# then fio runs on namespaces only in the defined range (which is 1 to 3 here).
diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py
index c278de43eb0..a6d82c7f0fa 100644
--- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py
+++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py
@@ -119,13 +119,12 @@ class Zap:
osd_uuid = details.get('osd_uuid')
break
- for osd_uuid, details in raw_report.items():
+ for _, details in raw_report.items():
device: str = details.get('device')
if details.get('osd_uuid') == osd_uuid:
raw_devices.add(device)
return list(raw_devices)
-
def find_associated_devices(self) -> List[api.Volume]:
"""From an ``osd_id`` and/or an ``osd_fsid``, filter out all the Logical Volumes (LVs) in the
diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py
index cca64e83ab0..c971b7776ef 100644
--- a/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py
+++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py
@@ -22,7 +22,7 @@ ceph_bluestore_tool_output = '''
"whoami": "0"
},
"/dev/vdx": {
- "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6",
+ "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b7",
"size": 214748364800,
"btime": "2024-10-16T10:51:05.955279+0000",
"description": "main",
diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py
index 77b55314f66..921e61a4534 100644
--- a/src/ceph-volume/ceph_volume/util/disk.py
+++ b/src/ceph-volume/ceph_volume/util/disk.py
@@ -347,12 +347,21 @@ def lsblk_all(device: str = '',
return result
-def is_device(dev):
+def is_device(dev: str) -> bool:
"""
- Boolean to determine if a given device is a block device (**not**
- a partition!)
+ Determines whether the given path corresponds to a block device (not a partition).
- For example: /dev/sda would return True, but not /dev/sdc1
+ This function checks whether the provided device path represents a valid block device,
+ such as a physical disk (/dev/sda) or an allowed loop device, but excludes partitions
+ (/dev/sdc1). It performs several validation steps, including file existence, path format,
+ device type, and additional checks for loop devices if allowed.
+
+ Args:
+ dev (str): The path to the device (e.g., "/dev/sda").
+
+ Returns:
+ bool: True if the path corresponds to a valid block device (not a partition),
+ otherwise False.
"""
if not os.path.exists(dev):
return False
@@ -364,7 +373,7 @@ def is_device(dev):
TYPE = lsblk(dev).get('TYPE')
if TYPE:
- return TYPE in ['disk', 'mpath']
+ return TYPE in ['disk', 'mpath', 'loop']
# fallback to stat
return _stat_is_device(os.lstat(dev).st_mode) and not is_partition(dev)
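With the TYPE list extended above, lsblk-reported loop devices now pass the whole-device check while partitions remain excluded. A small standalone sketch of just the TYPE test (not the full is_device() logic, which also validates existence and path format):

    ALLOWED_TYPES = ['disk', 'mpath', 'loop']   # as in the change above

    def type_is_device(lsblk_type: str) -> bool:
        # Mirrors the TYPE check in is_device(); 'part' (a partition) stays excluded.
        return lsblk_type in ALLOWED_TYPES

    assert type_is_device('disk')
    assert type_is_device('loop')       # newly accepted
    assert not type_is_device('part')   # partitions are still rejected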
diff --git a/src/ceph-volume/ceph_volume/util/prepare.py b/src/ceph-volume/ceph_volume/util/prepare.py
index 9c863b83d93..ff7fc023fc4 100644
--- a/src/ceph-volume/ceph_volume/util/prepare.py
+++ b/src/ceph-volume/ceph_volume/util/prepare.py
@@ -9,6 +9,7 @@ import logging
import json
from ceph_volume import process, conf, terminal
from ceph_volume.util import system, constants, str_to_int, disk
+from typing import Optional
logger = logging.getLogger(__name__)
mlogger = terminal.MultiLogger(__name__)
@@ -121,7 +122,7 @@ def get_block_wal_size(lv_format=True):
return wal_size
-def create_id(fsid, json_secrets, osd_id=None):
+def create_id(fsid: str, json_secrets: str, osd_id: Optional[str]=None) -> str:
"""
:param fsid: The osd fsid to create, always required
:param json_secrets: a json-ready object with whatever secrets are wanted
diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index d2ddf564116..a8616980e4d 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -111,6 +111,7 @@ from cephadmlib.file_utils import (
unlink_file,
write_new,
write_tmp,
+ update_meta_file,
)
from cephadmlib.net_utils import (
build_addrv_params,
@@ -3453,6 +3454,7 @@ def list_daemons(
detail: bool = True,
legacy_dir: Optional[str] = None,
daemon_name: Optional[str] = None,
+ type_of_daemon: Optional[str] = None,
) -> List[Dict[str, str]]:
host_version: Optional[str] = None
ls = []
@@ -3489,6 +3491,8 @@ def list_daemons(
if os.path.exists(data_dir):
for i in os.listdir(data_dir):
if i in ['mon', 'osd', 'mds', 'mgr', 'rgw']:
+ if type_of_daemon and type_of_daemon != i:
+ continue
daemon_type = i
for j in os.listdir(os.path.join(data_dir, i)):
if '-' not in j:
@@ -3525,6 +3529,8 @@ def list_daemons(
if daemon_name and name != daemon_name:
continue
(daemon_type, daemon_id) = j.split('.', 1)
+ if type_of_daemon and type_of_daemon != daemon_type:
+ continue
unit_name = get_unit_name(fsid,
daemon_type,
daemon_id)
@@ -4705,6 +4711,34 @@ def command_list_images(ctx: CephadmContext) -> None:
# print default images
cp_obj.write(sys.stdout)
+
+def update_service_for_daemon(ctx: CephadmContext,
+ available_daemons: list,
+ update_daemons: list) -> None:
+ """ Update the unit.meta file of daemon with required service name for valid daemons"""
+
+ data = {'service_name': ctx.service_name}
+ # check if all the daemon names are valid
+ if not set(update_daemons).issubset(set(available_daemons)):
+        raise Error(f'Error EINVAL: one or more daemons of {update_daemons} do not exist on this host')
+ for name in update_daemons:
+ path = os.path.join(ctx.data_dir, ctx.fsid, name, 'unit.meta')
+ update_meta_file(path, data)
+ print(f'Successfully updated daemon {name} with service {ctx.service_name}')
+
+
+@infer_fsid
+def command_update_osd_service(ctx: CephadmContext) -> int:
+ """update service for provided daemon"""
+ update_daemons = [f'osd.{osd_id}' for osd_id in ctx.osd_ids.split(',')]
+ daemons = list_daemons(ctx, detail=False, type_of_daemon='osd')
+ if not daemons:
+        raise Error(f'Daemon {ctx.osd_ids} does not exist on this host')
+ available_daemons = [d['name'] for d in daemons]
+ update_service_for_daemon(ctx, available_daemons, update_daemons)
+ return 0
+
+
##################################
@@ -5571,6 +5605,14 @@ def _get_parser():
parser_list_images = subparsers.add_parser(
'list-images', help='list all the default images')
parser_list_images.set_defaults(func=command_list_images)
+
+ parser_update_service = subparsers.add_parser(
+ 'update-osd-service', help='update service for provided daemon')
+ parser_update_service.set_defaults(func=command_update_osd_service)
+ parser_update_service.add_argument('--fsid', help='cluster FSID')
+ parser_update_service.add_argument('--osd-ids', required=True, help='Comma-separated OSD IDs')
+ parser_update_service.add_argument('--service-name', required=True, help='OSD service name')
+
return parser
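Taken together, the additions above wire a new CLI entry point to update_service_for_daemon(). An assumed invocation and the daemon-name expansion it performs (the fsid and service name are examples only):

    # Hypothetical invocation of the new subcommand:
    #   cephadm update-osd-service --fsid <fsid> --osd-ids 1,2 --service-name osd.foo
    osd_ids = '1,2'
    update_daemons = [f'osd.{osd_id}' for osd_id in osd_ids.split(',')]
    assert update_daemons == ['osd.1', 'osd.2']
    # Each matching daemon's unit.meta then gets service_name set to 'osd.foo'.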
diff --git a/src/cephadm/cephadmlib/daemons/monitoring.py b/src/cephadm/cephadmlib/daemons/monitoring.py
index 9a9402632b0..4ba00daaefb 100644
--- a/src/cephadm/cephadmlib/daemons/monitoring.py
+++ b/src/cephadm/cephadmlib/daemons/monitoring.py
@@ -16,7 +16,13 @@ from ..daemon_form import register as register_daemon_form
from ..daemon_identity import DaemonIdentity
from ..deployment_utils import to_deployment_container
from ..exceptions import Error
-from ..net_utils import get_fqdn, get_hostname, get_ip_addresses, wrap_ipv6
+from ..net_utils import (
+ get_fqdn,
+ get_hostname,
+ get_ip_addresses,
+ wrap_ipv6,
+ EndPoint,
+)
@register_daemon_form
@@ -89,11 +95,6 @@ class Monitoring(ContainerDaemonForm):
'image': DefaultImages.ALERTMANAGER.image_ref,
'cpus': '2',
'memory': '2GB',
- 'args': [
- '--cluster.listen-address=:{}'.format(
- port_map['alertmanager'][1]
- ),
- ],
'config-json-files': [
'alertmanager.yml',
],
@@ -248,11 +249,14 @@ class Monitoring(ContainerDaemonForm):
ip = meta['ip']
if 'ports' in meta and meta['ports']:
port = meta['ports'][0]
- if daemon_type == 'prometheus':
- config = fetch_configs(ctx)
+ config = fetch_configs(ctx)
+ if daemon_type in ['prometheus', 'alertmanager']:
ip_to_bind_to = config.get('ip_to_bind_to', '')
if ip_to_bind_to:
ip = ip_to_bind_to
+ web_listen_addr = str(EndPoint(ip, port))
+ r += [f'--web.listen-address={web_listen_addr}']
+ if daemon_type == 'prometheus':
retention_time = config.get('retention_time', '15d')
retention_size = config.get(
'retention_size', '0'
@@ -276,9 +280,11 @@ class Monitoring(ContainerDaemonForm):
r += ['--web.route-prefix=/prometheus/']
else:
r += [f'--web.external-url={scheme}://{host}:{port}']
- r += [f'--web.listen-address={ip}:{port}']
if daemon_type == 'alertmanager':
- config = fetch_configs(ctx)
+ clus_listen_addr = str(
+ EndPoint(ip, self.port_map[daemon_type][1])
+ )
+ r += [f'--cluster.listen-address={clus_listen_addr}']
use_url_prefix = config.get('use_url_prefix', False)
peers = config.get('peers', list()) # type: ignore
for peer in peers:
@@ -294,13 +300,11 @@ class Monitoring(ContainerDaemonForm):
if daemon_type == 'promtail':
r += ['--config.expand-env']
if daemon_type == 'prometheus':
- config = fetch_configs(ctx)
try:
r += [f'--web.config.file={config["web_config"]}']
except KeyError:
pass
if daemon_type == 'node-exporter':
- config = fetch_configs(ctx)
try:
r += [f'--web.config.file={config["web_config"]}']
except KeyError:
diff --git a/src/cephadm/cephadmlib/file_utils.py b/src/cephadm/cephadmlib/file_utils.py
index 27e70e31756..4dd88cc3671 100644
--- a/src/cephadm/cephadmlib/file_utils.py
+++ b/src/cephadm/cephadmlib/file_utils.py
@@ -5,6 +5,7 @@ import datetime
import logging
import os
import tempfile
+import json
from contextlib import contextmanager
from pathlib import Path
@@ -157,3 +158,26 @@ def unlink_file(
except Exception:
if not ignore_errors:
raise
+
+
+def update_meta_file(file_path: str, update_key_val: dict) -> None:
+ """Update key in the file with provided value"""
+ try:
+ with open(file_path, 'r') as fh:
+ data = json.load(fh)
+ file_stat = os.stat(file_path)
+ except FileNotFoundError:
+ raise
+ except Exception:
+ logger.exception(f'Failed to update {file_path}')
+ raise
+ data.update(
+ {key: value for key, value in update_key_val.items() if key in data}
+ )
+
+ with write_new(
+ file_path,
+ owner=(file_stat.st_uid, file_stat.st_gid),
+ perms=(file_stat.st_mode & 0o777),
+ ) as fh:
+ fh.write(json.dumps(data, indent=4) + '\n')
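Note that update_meta_file() above only rewrites keys that already exist in the target file; unknown keys in the update dict are silently dropped. A minimal sketch of that merge rule (file contents assumed):

    data = {'service_name': 'osd.old', 'ports': []}          # contents of unit.meta (assumed)
    update_key_val = {'service_name': 'osd.new', 'bogus': 1}

    data.update({k: v for k, v in update_key_val.items() if k in data})
    assert data == {'service_name': 'osd.new', 'ports': []}  # unknown keys are ignored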
diff --git a/src/cephadm/cephadmlib/net_utils.py b/src/cephadm/cephadmlib/net_utils.py
index 9a7f138b1c6..bfa61d933ef 100644
--- a/src/cephadm/cephadmlib/net_utils.py
+++ b/src/cephadm/cephadmlib/net_utils.py
@@ -24,12 +24,22 @@ class EndPoint:
def __init__(self, ip: str, port: int) -> None:
self.ip = ip
self.port = port
+ self.is_ipv4 = True
+ try:
+ if ip and ipaddress.ip_network(ip).version == 6:
+ self.is_ipv4 = False
+ except Exception:
+ logger.exception('Failed to check ip address version')
def __str__(self) -> str:
- return f'{self.ip}:{self.port}'
+ if self.is_ipv4:
+ return f'{self.ip}:{self.port}'
+ return f'[{self.ip}]:{self.port}'
def __repr__(self) -> str:
- return f'{self.ip}:{self.port}'
+ if self.is_ipv4:
+ return f'{self.ip}:{self.port}'
+ return f'[{self.ip}]:{self.port}'
def attempt_bind(ctx, s, address, port):
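The EndPoint change above makes IPv6 addresses render in the bracketed form that listen-address flags expect, while IPv4 addresses are unchanged. A plain-Python sketch mirroring the new __str__ logic:

    import ipaddress

    def format_endpoint(ip: str, port: int) -> str:
        # Bracket IPv6 literals, leave IPv4 addresses as-is (as EndPoint.__str__ now does).
        try:
            if ip and ipaddress.ip_network(ip).version == 6:
                return f'[{ip}]:{port}'
        except ValueError:
            pass
        return f'{ip}:{port}'

    assert format_endpoint('1.2.3.4', 9095) == '1.2.3.4:9095'
    assert format_endpoint('2001:db8::1', 9095) == '[2001:db8::1]:9095'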
diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py
index c5094db335f..1736639ed55 100644
--- a/src/cephadm/tests/test_deploy.py
+++ b/src/cephadm/tests/test_deploy.py
@@ -316,7 +316,7 @@ def test_deploy_a_monitoring_container(cephadm_fs, funkypatch):
runfile_lines = f.read().splitlines()
assert 'podman' in runfile_lines[-1]
assert runfile_lines[-1].endswith(
- 'quay.io/titans/prometheus:latest --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus --storage.tsdb.retention.time=15d --storage.tsdb.retention.size=0 --web.external-url=http://10.10.10.10:9095 --web.listen-address=1.2.3.4:9095'
+ 'quay.io/titans/prometheus:latest --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus --web.listen-address=1.2.3.4:9095 --storage.tsdb.retention.time=15d --storage.tsdb.retention.size=0 --web.external-url=http://10.10.10.10:9095'
)
assert '--user 8765' in runfile_lines[-1]
assert f'-v /var/lib/ceph/{fsid}/prometheus.fire/etc/prometheus:/etc/prometheus:Z' in runfile_lines[-1]
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index ea3cce16609..c607839a8d2 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -13,6 +13,7 @@ if(WIN32)
endif()
add_subdirectory(io_exerciser)
+add_subdirectory(json)
add_subdirectory(options)
set(common_srcs
diff --git a/src/common/bit_vector.hpp b/src/common/bit_vector.hpp
index 961d9a0192e..c5fd491ed29 100644
--- a/src/common/bit_vector.hpp
+++ b/src/common/bit_vector.hpp
@@ -29,8 +29,8 @@ private:
static const uint8_t MASK = static_cast<uint8_t>((1 << _bit_count) - 1);
// must be power of 2
- BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1)));
- BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE);
+ static_assert((_bit_count != 0) && !(_bit_count & (_bit_count - 1)));
+ static_assert(_bit_count <= BITS_PER_BYTE);
template <typename DataIterator>
class ReferenceImpl {
diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h
index 01feff4c063..0b05be5372e 100644
--- a/src/common/ceph_time.h
+++ b/src/common/ceph_time.h
@@ -342,6 +342,23 @@ public:
}
};
+// Please note time_guard is not thread-safe -- multiple threads
+// updating the same diff_accumulator can corrupt it.
+template <class ClockT = mono_clock>
+class time_guard {
+ const typename ClockT::time_point start;
+ timespan& diff_accumulator;
+
+public:
+ time_guard(timespan& diff_accumulator)
+ : start(ClockT::now()),
+ diff_accumulator(diff_accumulator) {
+ }
+ ~time_guard() {
+ diff_accumulator += ClockT::now() - start;
+ }
+};
+
namespace time_detail {
// So that our subtractions produce negative spans rather than
// arithmetic underflow.
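time_guard above is a small RAII accumulator: it samples the clock on construction and adds the elapsed span to diff_accumulator on destruction. A rough Python analogue of the idea (not the Ceph API; the same single-threaded caveat applies):

    import time
    from contextlib import contextmanager

    @contextmanager
    def time_guard(acc):
        """Accumulate elapsed wall time into acc[0], like the C++ RAII guard."""
        start = time.monotonic()
        try:
            yield
        finally:
            acc[0] += time.monotonic() - start

    total = [0.0]
    with time_guard(total):
        sum(range(100_000))     # some work to time
    assert total[0] > 0.0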
diff --git a/src/common/io_exerciser/CMakeLists.txt b/src/common/io_exerciser/CMakeLists.txt
index 07091df86e1..ab2e64fc222 100644
--- a/src/common/io_exerciser/CMakeLists.txt
+++ b/src/common/io_exerciser/CMakeLists.txt
@@ -5,9 +5,11 @@ add_library(object_io_exerciser STATIC
Model.cc
ObjectModel.cc
RadosIo.cc
+ EcIoSequence.cc
)
target_link_libraries(object_io_exerciser
- librados
+ librados
global
+ json_structures
) \ No newline at end of file
diff --git a/src/common/io_exerciser/DataGenerator.cc b/src/common/io_exerciser/DataGenerator.cc
index 9aa77eeb6e9..701c32fa9ec 100644
--- a/src/common/io_exerciser/DataGenerator.cc
+++ b/src/common/io_exerciser/DataGenerator.cc
@@ -2,32 +2,28 @@
// vim: ts=8 sw=2 smarttab
#include "DataGenerator.h"
-#include "ObjectModel.h"
+#include <chrono>
+#include <iostream>
+#include <stdexcept>
+#include "ObjectModel.h"
#include "common/debug.h"
#include "common/dout.h"
-
#include "fmt/format.h"
#include "fmt/ranges.h"
-#include <chrono>
-#include <iostream>
-#include <stdexcept>
-
#define dout_subsys ceph_subsys_rados
#define dout_context g_ceph_context
using DataGenerator = ceph::io_exerciser::data_generation::DataGenerator;
-using SeededRandomGenerator = ceph::io_exerciser::data_generation
- ::SeededRandomGenerator;
-using HeaderedSeededRandomGenerator = ceph::io_exerciser::data_generation
- ::HeaderedSeededRandomGenerator;
+using SeededRandomGenerator =
+ ceph::io_exerciser::data_generation ::SeededRandomGenerator;
+using HeaderedSeededRandomGenerator =
+ ceph::io_exerciser::data_generation ::HeaderedSeededRandomGenerator;
std::unique_ptr<DataGenerator> DataGenerator::create_generator(
- GenerationType generationType, const ObjectModel& model)
-{
- switch(generationType)
- {
+ GenerationType generationType, const ObjectModel& model) {
+ switch (generationType) {
case GenerationType::SeededRandom:
return std::make_unique<SeededRandomGenerator>(model);
case GenerationType::HeaderedSeededRandom:
@@ -39,28 +35,25 @@ std::unique_ptr<DataGenerator> DataGenerator::create_generator(
return nullptr;
}
-bufferlist DataGenerator::generate_wrong_data(uint64_t offset, uint64_t length)
-{
+bufferlist DataGenerator::generate_wrong_data(uint64_t offset,
+ uint64_t length) {
bufferlist retlist;
uint64_t block_size = m_model.get_block_size();
char buffer[block_size];
- for (uint64_t block_offset = offset;
- block_offset < offset + length;
- block_offset++)
- {
+ for (uint64_t block_offset = offset; block_offset < offset + length;
+ block_offset++) {
std::memset(buffer, 0, block_size);
retlist.append(ceph::bufferptr(buffer, block_size));
}
return retlist;
}
-bool DataGenerator::validate(bufferlist& bufferlist, uint64_t offset, uint64_t length)
-{
+bool DataGenerator::validate(bufferlist& bufferlist, uint64_t offset,
+ uint64_t length) {
return bufferlist.contents_equal(generate_data(offset, length));
}
-ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset)
-{
+ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset) {
uint64_t block_size = m_model.get_block_size();
char buffer[block_size];
@@ -70,29 +63,26 @@ ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset)
constexpr size_t generation_length = sizeof(uint64_t);
- for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--)
- {
+ for (uint64_t i = 0; i < block_size;
+ i += (2 * generation_length), rand1++, rand2--) {
std::memcpy(buffer + i, &rand1, generation_length);
std::memcpy(buffer + i + generation_length, &rand2, generation_length);
}
size_t remainingBytes = block_size % (generation_length * 2);
- if (remainingBytes > generation_length)
- {
+ if (remainingBytes > generation_length) {
size_t remainingBytes2 = remainingBytes - generation_length;
std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2);
- }
- else if (remainingBytes > 0)
- {
+ } else if (remainingBytes > 0) {
std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
}
return ceph::bufferptr(buffer, block_size);
}
-ceph::bufferptr SeededRandomGenerator::generate_wrong_block(uint64_t block_offset)
-{
+ceph::bufferptr SeededRandomGenerator::generate_wrong_block(
+ uint64_t block_offset) {
uint64_t block_size = m_model.get_block_size();
char buffer[block_size];
@@ -102,141 +92,134 @@ ceph::bufferptr SeededRandomGenerator::generate_wrong_block(uint64_t block_offse
constexpr size_t generation_length = sizeof(uint64_t);
- for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--)
- {
+ for (uint64_t i = 0; i < block_size;
+ i += (2 * generation_length), rand1++, rand2--) {
std::memcpy(buffer + i, &rand1, generation_length);
std::memcpy(buffer + i + generation_length, &rand2, generation_length);
}
size_t remainingBytes = block_size % (generation_length * 2);
- if (remainingBytes > generation_length)
- {
+ if (remainingBytes > generation_length) {
size_t remainingBytes2 = remainingBytes - generation_length;
std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2);
- }
- else if (remainingBytes > 0)
- {
+ } else if (remainingBytes > 0) {
std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes);
}
return ceph::bufferptr(buffer, block_size);
}
-bufferlist SeededRandomGenerator::generate_data(uint64_t offset, uint64_t length)
-{
+bufferlist SeededRandomGenerator::generate_data(uint64_t offset,
+ uint64_t length) {
bufferlist retlist;
- for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++)
- {
+ for (uint64_t block_offset = offset; block_offset < offset + length;
+ block_offset++) {
retlist.append(generate_block(block_offset));
}
return retlist;
}
-bufferlist SeededRandomGenerator::generate_wrong_data(uint64_t offset, uint64_t length)
-{
+bufferlist SeededRandomGenerator::generate_wrong_data(uint64_t offset,
+ uint64_t length) {
bufferlist retlist;
- for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++)
- {
+ for (uint64_t block_offset = offset; block_offset < offset + length;
+ block_offset++) {
retlist.append(generate_wrong_block(block_offset));
}
return retlist;
}
-HeaderedSeededRandomGenerator
- ::HeaderedSeededRandomGenerator(const ObjectModel& model,
- std::optional<uint64_t> unique_run_id) :
- SeededRandomGenerator(model),
- unique_run_id(unique_run_id.value_or(generate_unique_run_id()))
-{
-
-}
+HeaderedSeededRandomGenerator ::HeaderedSeededRandomGenerator(
+ const ObjectModel& model, std::optional<uint64_t> unique_run_id)
+ : SeededRandomGenerator(model),
+ unique_run_id(unique_run_id.value_or(generate_unique_run_id())) {}
-uint64_t HeaderedSeededRandomGenerator::generate_unique_run_id()
-{
+uint64_t HeaderedSeededRandomGenerator::generate_unique_run_id() {
std::mt19937_64 random_generator =
- std::mt19937_64(duration_cast<std::chrono::milliseconds>(
- std::chrono::system_clock::now().time_since_epoch()).count());
+ std::mt19937_64(duration_cast<std::chrono::milliseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count());
- return random_generator();
+ return random_generator();
}
-ceph::bufferptr HeaderedSeededRandomGenerator::generate_block(uint64_t block_offset)
-{
+ceph::bufferptr HeaderedSeededRandomGenerator::generate_block(
+ uint64_t block_offset) {
SeedBytes seed = m_model.get_seed(block_offset);
- TimeBytes current_time = duration_cast<std::chrono::milliseconds>(
- std::chrono::system_clock::now().time_since_epoch()).count();
+ TimeBytes current_time =
+ duration_cast<std::chrono::milliseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count();
- ceph::bufferptr bufferptr = SeededRandomGenerator::generate_block(block_offset);
+ ceph::bufferptr bufferptr =
+ SeededRandomGenerator::generate_block(block_offset);
- std::memcpy(bufferptr.c_str() + uniqueIdStart(), &unique_run_id, uniqueIdLength());
+ std::memcpy(bufferptr.c_str() + uniqueIdStart(), &unique_run_id,
+ uniqueIdLength());
std::memcpy(bufferptr.c_str() + seedStart(), &seed, seedLength());
std::memcpy(bufferptr.c_str() + timeStart(), &current_time, timeLength());
return bufferptr;
}
-ceph::bufferptr HeaderedSeededRandomGenerator::generate_wrong_block(uint64_t block_offset)
-{
+ceph::bufferptr HeaderedSeededRandomGenerator::generate_wrong_block(
+ uint64_t block_offset) {
return HeaderedSeededRandomGenerator::generate_block(block_offset % 8);
}
const HeaderedSeededRandomGenerator::UniqueIdBytes
- HeaderedSeededRandomGenerator::readUniqueRunId(uint64_t block_offset,
- const bufferlist& bufferlist)
-{
+HeaderedSeededRandomGenerator::readUniqueRunId(uint64_t block_offset,
+ const bufferlist& bufferlist) {
UniqueIdBytes read_unique_run_id = 0;
- std::memcpy(&read_unique_run_id,
- &bufferlist[(block_offset * m_model.get_block_size()) + uniqueIdStart()],
- uniqueIdLength());
+ std::memcpy(
+ &read_unique_run_id,
+ &bufferlist[(block_offset * m_model.get_block_size()) + uniqueIdStart()],
+ uniqueIdLength());
return read_unique_run_id;
}
const HeaderedSeededRandomGenerator::SeedBytes
- HeaderedSeededRandomGenerator::readSeed(uint64_t block_offset,
- const bufferlist& bufferlist)
-{
+HeaderedSeededRandomGenerator::readSeed(uint64_t block_offset,
+ const bufferlist& bufferlist) {
SeedBytes read_seed = 0;
- std::memcpy(&read_seed,
- &bufferlist[(block_offset * m_model.get_block_size()) + seedStart()],
- seedLength());
+ std::memcpy(
+ &read_seed,
+ &bufferlist[(block_offset * m_model.get_block_size()) + seedStart()],
+ seedLength());
return read_seed;
}
const HeaderedSeededRandomGenerator::TimeBytes
- HeaderedSeededRandomGenerator::readDateTime(uint64_t block_offset,
- const bufferlist& bufferlist)
-{
+HeaderedSeededRandomGenerator::readDateTime(uint64_t block_offset,
+ const bufferlist& bufferlist) {
TimeBytes read_time = 0;
- std::memcpy(&read_time,
- &bufferlist[(block_offset * m_model.get_block_size()) + timeStart()],
- timeLength());
+ std::memcpy(
+ &read_time,
+ &bufferlist[(block_offset * m_model.get_block_size()) + timeStart()],
+ timeLength());
return read_time;
}
bool HeaderedSeededRandomGenerator::validate(bufferlist& bufferlist,
- uint64_t offset, uint64_t length)
-{
+ uint64_t offset, uint64_t length) {
std::vector<uint64_t> invalid_block_offsets;
- for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++)
- {
- bool valid_block
- = validate_block(block_offset,
- (bufferlist.c_str() + ((block_offset - offset) *
- m_model.get_block_size())));
- if (!valid_block)
- {
+ for (uint64_t block_offset = offset; block_offset < offset + length;
+ block_offset++) {
+ bool valid_block = validate_block(
+ block_offset, (bufferlist.c_str() +
+ ((block_offset - offset) * m_model.get_block_size())));
+ if (!valid_block) {
invalid_block_offsets.push_back(block_offset);
}
}
- if (!invalid_block_offsets.empty())
- {
+ if (!invalid_block_offsets.empty()) {
printDebugInformationForOffsets(offset, invalid_block_offsets, bufferlist);
}
@@ -244,59 +227,51 @@ bool HeaderedSeededRandomGenerator::validate(bufferlist& bufferlist,
}
bool HeaderedSeededRandomGenerator::validate_block(uint64_t block_offset,
- const char* buffer_start)
-{
+ const char* buffer_start) {
  // We validate that the block matches what we generate byte for byte;
  // however, we ignore the time section of the header
ceph::bufferptr bufferptr = generate_block(block_offset);
bool valid = strncmp(bufferptr.c_str(), buffer_start, timeStart()) == 0;
- valid = valid ? strncmp(bufferptr.c_str() + timeEnd(),
- buffer_start + timeEnd(),
- m_model.get_block_size() - timeEnd()) == 0 : valid;
+ valid = valid
+ ? strncmp(bufferptr.c_str() + timeEnd(), buffer_start + timeEnd(),
+ m_model.get_block_size() - timeEnd()) == 0
+ : valid;
return valid;
}
const HeaderedSeededRandomGenerator::ErrorType
- HeaderedSeededRandomGenerator::getErrorTypeForBlock(uint64_t read_offset,
- uint64_t block_offset,
- const bufferlist& bufferlist)
-{
- try
- {
- UniqueIdBytes read_unique_run_id = readUniqueRunId(block_offset - read_offset,
- bufferlist);
- if (unique_run_id != read_unique_run_id)
- {
+HeaderedSeededRandomGenerator::getErrorTypeForBlock(
+ uint64_t read_offset, uint64_t block_offset, const bufferlist& bufferlist) {
+ try {
+ UniqueIdBytes read_unique_run_id =
+ readUniqueRunId(block_offset - read_offset, bufferlist);
+ if (unique_run_id != read_unique_run_id) {
return ErrorType::RUN_ID_MISMATCH;
}
SeedBytes read_seed = readSeed(block_offset - read_offset, bufferlist);
- if (m_model.get_seed(block_offset) != read_seed)
- {
+ if (m_model.get_seed(block_offset) != read_seed) {
return ErrorType::SEED_MISMATCH;
}
if (std::strncmp(&bufferlist[((block_offset - read_offset) *
- m_model.get_block_size()) + bodyStart()],
+ m_model.get_block_size()) +
+ bodyStart()],
generate_block(block_offset).c_str() + bodyStart(),
- m_model.get_block_size() - bodyStart()) != 0)
- {
+ m_model.get_block_size() - bodyStart()) != 0) {
return ErrorType::DATA_MISMATCH;
}
- }
- catch(const std::exception& e)
- {
+ } catch (const std::exception& e) {
return ErrorType::DATA_NOT_FOUND;
}
return ErrorType::UNKNOWN;
}
-void HeaderedSeededRandomGenerator
- ::printDebugInformationForBlock(uint64_t read_offset, uint64_t block_offset,
- const bufferlist& bufferlist)
-{
- ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset, bufferlist);
+void HeaderedSeededRandomGenerator ::printDebugInformationForBlock(
+ uint64_t read_offset, uint64_t block_offset, const bufferlist& bufferlist) {
+ ErrorType blockError =
+ getErrorTypeForBlock(read_offset, block_offset, bufferlist);
TimeBytes read_time = 0;
std::time_t ttp;
@@ -304,433 +279,361 @@ void HeaderedSeededRandomGenerator
char read_bytes[m_model.get_block_size()];
char generated_bytes[m_model.get_block_size()];
- if (blockError == ErrorType::DATA_MISMATCH || blockError == ErrorType::UNKNOWN)
- {
+ if (blockError == ErrorType::DATA_MISMATCH ||
+ blockError == ErrorType::UNKNOWN) {
read_time = readDateTime(block_offset - read_offset, bufferlist);
- std::chrono::system_clock::time_point time_point{std::chrono::milliseconds{read_time}};
+ std::chrono::system_clock::time_point time_point{
+ std::chrono::milliseconds{read_time}};
ttp = std::chrono::system_clock::to_time_t(time_point);
- std::memcpy(&read_bytes,
- &bufferlist[((block_offset - read_offset) * m_model.get_block_size())],
- m_model.get_block_size() - bodyStart());
- std::memcpy(&generated_bytes,
- generate_block(block_offset).c_str(),
+ std::memcpy(
+ &read_bytes,
+ &bufferlist[((block_offset - read_offset) * m_model.get_block_size())],
+ m_model.get_block_size() - bodyStart());
+ std::memcpy(&generated_bytes, generate_block(block_offset).c_str(),
m_model.get_block_size() - bodyStart());
}
std::string error_string;
- switch(blockError)
- {
- case ErrorType::RUN_ID_MISMATCH:
- {
- UniqueIdBytes read_unique_run_id = readUniqueRunId((block_offset - read_offset),
- bufferlist);
- error_string = fmt::format("Header (Run ID) mismatch detected at block {} "
- "(byte offset {}) Header expected run id {} but found id {}. "
- "Block data corrupt or not written from this instance of this application.",
- block_offset,
- block_offset * m_model.get_block_size(),
- unique_run_id,
- read_unique_run_id);
- }
- break;
-
- case ErrorType::SEED_MISMATCH:
- {
+ switch (blockError) {
+ case ErrorType::RUN_ID_MISMATCH: {
+ UniqueIdBytes read_unique_run_id =
+ readUniqueRunId((block_offset - read_offset), bufferlist);
+ error_string = fmt::format(
+ "Header (Run ID) mismatch detected at block {} "
+ "(byte offset {}) Header expected run id {} but found id {}. "
+ "Block data corrupt or not written from this instance of this "
+ "application.",
+ block_offset, block_offset * m_model.get_block_size(), unique_run_id,
+ read_unique_run_id);
+ } break;
+
+ case ErrorType::SEED_MISMATCH: {
SeedBytes read_seed = readSeed((block_offset - read_offset), bufferlist);
- if (m_model.get_seed_offsets(read_seed).size() == 0)
- {
- error_string = fmt::format("Data (Seed) mismatch detected at block {}"
- " (byte offset {}). Header expected seed {} but found seed {}. "
- "Read data was not from any other recognised block in the object.",
- block_offset,
- block_offset * m_model.get_block_size(),
- m_model.get_seed(block_offset),
- read_seed);
- }
- else
- {
+ if (m_model.get_seed_offsets(read_seed).size() == 0) {
+ error_string = fmt::format(
+ "Data (Seed) mismatch detected at block {}"
+ " (byte offset {}). Header expected seed {} but found seed {}. "
+ "Read data was not from any other recognised block in the object.",
+ block_offset, block_offset * m_model.get_block_size(),
+ m_model.get_seed(block_offset), read_seed);
+ } else {
std::vector<int> seed_offsets = m_model.get_seed_offsets(read_seed);
- error_string = fmt::format("Data (Seed) mismatch detected at block {}"
- " (byte offset {}). Header expected seed {} but found seed {}."
- " Read data was from a different block(s): {}",
- block_offset,
- block_offset * m_model.get_block_size(),
- m_model.get_seed(block_offset),
- read_seed,
+ error_string = fmt::format(
+ "Data (Seed) mismatch detected at block {}"
+ " (byte offset {}). Header expected seed {} but found seed {}."
+ " Read data was from a different block(s): {}",
+ block_offset, block_offset * m_model.get_block_size(),
+ m_model.get_seed(block_offset), read_seed,
fmt::join(seed_offsets.begin(), seed_offsets.end(), ""));
}
- }
- break;
-
- case ErrorType::DATA_MISMATCH:
- {
- error_string = fmt::format("Data (Body) mismatch detected at block {}"
- " (byte offset {}). Header data matches, data body does not."
- " Data written at {}\nExpected data: \n{:02x}\nRead data:{:02x}",
- block_offset,
- block_offset * m_model.get_block_size(),
+ } break;
+
+ case ErrorType::DATA_MISMATCH: {
+ error_string = fmt::format(
+ "Data (Body) mismatch detected at block {}"
+ " (byte offset {}). Header data matches, data body does not."
+ " Data written at {}\nExpected data: \n{:02x}\nRead data:{:02x}",
+ block_offset, block_offset * m_model.get_block_size(),
std::ctime(&ttp),
- fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""),
+ fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(),
+ ""),
fmt::join(read_bytes, read_bytes + m_model.get_block_size(), ""));
- }
- break;
+ } break;
- case ErrorType::DATA_NOT_FOUND:
- {
+ case ErrorType::DATA_NOT_FOUND: {
uint64_t bufferlist_length = bufferlist.to_str().size();
- error_string = fmt::format("Data (Body) could not be read at block {}"
- " (byte offset {}) offset in bufferlist returned from read: {}"
- " ({} bytes). Returned bufferlist length: {}.",
- block_offset,
- block_offset * m_model.get_block_size(),
+ error_string = fmt::format(
+ "Data (Body) could not be read at block {}"
+ " (byte offset {}) offset in bufferlist returned from read: {}"
+ " ({} bytes). Returned bufferlist length: {}.",
+ block_offset, block_offset * m_model.get_block_size(),
(block_offset - read_offset),
(block_offset - read_offset) * m_model.get_block_size(),
bufferlist_length);
- }
- break;
+ } break;
case ErrorType::UNKNOWN:
- [[ fallthrough ]];
-
- default:
- {
- error_string = fmt::format("Data mismatch detected at block {}"
- " (byte offset {}).\nExpected data:\n{:02x}\nRead data:\n{:02x}",
- block_offset,
- block_offset * m_model.get_block_size(),
- fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""),
+ [[fallthrough]];
+
+ default: {
+ error_string = fmt::format(
+ "Data mismatch detected at block {}"
+ " (byte offset {}).\nExpected data:\n{:02x}\nRead data:\n{:02x}",
+ block_offset, block_offset * m_model.get_block_size(),
+ fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(),
+ ""),
fmt::join(read_bytes, read_bytes + m_model.get_block_size(), ""));
- }
- break;
+ } break;
}
dout(0) << error_string << dendl;
}
-void HeaderedSeededRandomGenerator
- ::printDebugInformationForRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- ErrorType rangeError,
- const bufferlist& bufferlist)
-{
- switch(rangeError)
- {
- case ErrorType::RUN_ID_MISMATCH:
- printDebugInformationForRunIdMismatchRange(read_offset, start_block_offset,
- range_length_in_blocks, bufferlist);
- break;
- case ErrorType::SEED_MISMATCH:
- printDebugInformationForSeedMismatchRange(read_offset, start_block_offset,
- range_length_in_blocks, bufferlist);
- break;
- case ErrorType::DATA_MISMATCH:
- printDebugInformationDataBodyMismatchRange(read_offset, start_block_offset,
- range_length_in_blocks, bufferlist);
- break;
- case ErrorType::DATA_NOT_FOUND:
- printDebugInformationDataNotFoundRange(read_offset, start_block_offset,
- range_length_in_blocks, bufferlist);
- break;
- case ErrorType::UNKNOWN:
- [[ fallthrough ]];
- default:
- printDebugInformationCorruptRange(read_offset, start_block_offset,
- range_length_in_blocks, bufferlist);
- break;
+void HeaderedSeededRandomGenerator ::printDebugInformationForRange(
+ uint64_t read_offset, uint64_t start_block_offset,
+ uint64_t range_length_in_blocks, ErrorType rangeError,
+ const bufferlist& bufferlist) {
+ switch (rangeError) {
+ case ErrorType::RUN_ID_MISMATCH:
+ printDebugInformationForRunIdMismatchRange(
+ read_offset, start_block_offset, range_length_in_blocks, bufferlist);
+ break;
+ case ErrorType::SEED_MISMATCH:
+ printDebugInformationForSeedMismatchRange(
+ read_offset, start_block_offset, range_length_in_blocks, bufferlist);
+ break;
+ case ErrorType::DATA_MISMATCH:
+ printDebugInformationDataBodyMismatchRange(
+ read_offset, start_block_offset, range_length_in_blocks, bufferlist);
+ break;
+ case ErrorType::DATA_NOT_FOUND:
+ printDebugInformationDataNotFoundRange(
+ read_offset, start_block_offset, range_length_in_blocks, bufferlist);
+ break;
+ case ErrorType::UNKNOWN:
+ [[fallthrough]];
+ default:
+ printDebugInformationCorruptRange(read_offset, start_block_offset,
+ range_length_in_blocks, bufferlist);
+ break;
}
}
-void HeaderedSeededRandomGenerator
- ::printDebugInformationForRunIdMismatchRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist)
-{
+void HeaderedSeededRandomGenerator ::printDebugInformationForRunIdMismatchRange(
+ uint64_t read_offset, uint64_t start_block_offset,
+ uint64_t range_length_in_blocks, const bufferlist& bufferlist) {
uint64_t range_start = start_block_offset;
uint64_t range_length = 0;
- UniqueIdBytes initial_read_unique_run_id = readUniqueRunId(start_block_offset - read_offset,
- bufferlist);
+ UniqueIdBytes initial_read_unique_run_id =
+ readUniqueRunId(start_block_offset - read_offset, bufferlist);
for (uint64_t i = start_block_offset;
- i < start_block_offset + range_length_in_blocks; i++)
- {
- ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist)
- == ErrorType::RUN_ID_MISMATCH);
+ i < start_block_offset + range_length_in_blocks; i++) {
+ ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) ==
+ ErrorType::RUN_ID_MISMATCH);
- UniqueIdBytes read_unique_run_id = readUniqueRunId(i - read_offset, bufferlist);
+ UniqueIdBytes read_unique_run_id =
+ readUniqueRunId(i - read_offset, bufferlist);
if (initial_read_unique_run_id != read_unique_run_id ||
- i == (start_block_offset + range_length_in_blocks - 1))
- {
- if (range_length == 1)
- {
+ i == (start_block_offset + range_length_in_blocks - 1)) {
+ if (range_length == 1) {
printDebugInformationForBlock(read_offset, i, bufferlist);
- }
- else if (range_length > 1)
- {
- dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {} ({} bytes)"
- " and spanning a range of {} blocks ({} bytes). "
- "Expected run id {} for range but found id {}"
- " for all blocks in range. "
- "Block data corrupt or not written from this instance of this application.",
- range_start,
- range_start * m_model.get_block_size(),
- range_length,
- range_length * m_model.get_block_size(),
- unique_run_id,
- initial_read_unique_run_id) << dendl;
+ } else if (range_length > 1) {
+ dout(0)
+ << fmt::format(
+ "Data (Run ID) Mismatch detected from block {} ({} bytes)"
+ " and spanning a range of {} blocks ({} bytes). "
+ "Expected run id {} for range but found id {}"
+ " for all blocks in range. "
+ "Block data corrupt or not written from this instance of "
+ "this application.",
+ range_start, range_start * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size(),
+ unique_run_id, initial_read_unique_run_id)
+ << dendl;
}
range_start = i;
range_length = 1;
initial_read_unique_run_id = read_unique_run_id;
- }
- else
- {
+ } else {
range_length++;
}
}
- if (range_length == 1)
- {
- printDebugInformationForBlock(read_offset,
- start_block_offset + range_length_in_blocks - 1,
- bufferlist);
- }
- else if (range_length > 1)
- {
- dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {}"
- " ({} bytes) and spanning a range of {} blocks ({} bytes). "
- "Expected run id {} for range but found id for all blocks in range. "
- "Block data corrupt or not written from this instance of this application.",
- range_start,
- range_start * m_model.get_block_size(),
- range_length,
- range_length * m_model.get_block_size(),
- unique_run_id,
- initial_read_unique_run_id)
+ if (range_length == 1) {
+ printDebugInformationForBlock(
+ read_offset, start_block_offset + range_length_in_blocks - 1,
+ bufferlist);
+ } else if (range_length > 1) {
+ dout(0) << fmt::format(
+ "Data (Run ID) Mismatch detected from block {}"
+ " ({} bytes) and spanning a range of {} blocks ({} bytes). "
+                   "Expected run id {} for range but found id {} for all "
+                   "blocks in range. "
+ "Block data corrupt or not written from this instance of "
+ "this application.",
+ range_start, range_start * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size(),
+ unique_run_id, initial_read_unique_run_id)
<< dendl;
}
}
-void HeaderedSeededRandomGenerator
- ::printDebugInformationForSeedMismatchRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist)
-{
+void HeaderedSeededRandomGenerator ::printDebugInformationForSeedMismatchRange(
+ uint64_t read_offset, uint64_t start_block_offset,
+ uint64_t range_length_in_blocks, const bufferlist& bufferlist) {
uint64_t range_start = start_block_offset;
uint64_t range_length = 0;
// Assert here if needed, as we can't support values
// that can't be converted to a signed integer.
- ceph_assert(m_model.get_block_size() < (std::numeric_limits<uint64_t>::max() / 2));
+ ceph_assert(m_model.get_block_size() <
+ (std::numeric_limits<uint64_t>::max() / 2));
std::optional<int64_t> range_offset = 0;
for (uint64_t i = start_block_offset;
- i < start_block_offset + range_length_in_blocks; i++)
- {
- ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist)
- == ErrorType::SEED_MISMATCH);
+ i < start_block_offset + range_length_in_blocks; i++) {
+ ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) ==
+ ErrorType::SEED_MISMATCH);
SeedBytes read_seed = readSeed(i - read_offset, bufferlist);
std::vector<int> seed_found_offsets = m_model.get_seed_offsets(read_seed);
if ((seed_found_offsets.size() == 1 &&
- (static_cast<int64_t>(seed_found_offsets.front() - i) == range_offset)) ||
- range_length == 0)
- {
- if (range_length == 0)
- {
+ (static_cast<int64_t>(seed_found_offsets.front() - i) ==
+ range_offset)) ||
+ range_length == 0) {
+ if (range_length == 0) {
range_start = i;
- if (seed_found_offsets.size() > 0)
- {
+ if (seed_found_offsets.size() > 0) {
range_offset = seed_found_offsets.front() - i;
- }
- else
- {
+ } else {
range_offset = std::nullopt;
}
}
range_length++;
- }
- else
- {
- if (range_length == 1)
- {
+ } else {
+ if (range_length == 1) {
printDebugInformationForBlock(read_offset, i - 1, bufferlist);
- }
- else if (range_length > 1 && range_offset.has_value())
- {
- dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}"
- " ({} bytes) and spanning a range of {} blocks ({} bytes). "
- "Returned data located starting from block {} ({} bytes) "
- "and spanning a range of {} blocks ({} bytes).",
- range_start,
- range_start * m_model.get_block_size(),
- range_length, range_length * m_model.get_block_size(),
- static_cast<uint64_t>(*range_offset) + range_start,
- (static_cast<uint64_t>(*range_offset) + range_start)
- * m_model.get_block_size(),
- range_length,
- range_length * m_model.get_block_size())
- << dendl;
- }
- else
- {
- dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}"
- " ({} bytes) and spanning a range of {} blocks ({} bytes). "
- "Data seed mismatch spanning a range of {} blocks ({} bytes).",
- range_start,
- range_start * m_model.get_block_size(),
- range_length, range_length * m_model.get_block_size(),
- range_length,
- range_length * m_model.get_block_size())
- << dendl;
+ } else if (range_length > 1 && range_offset.has_value()) {
+ dout(0)
+ << fmt::format(
+ "Data (Seed) Mismatch detected from block {}"
+ " ({} bytes) and spanning a range of {} blocks ({} bytes). "
+ "Returned data located starting from block {} ({} bytes) "
+ "and spanning a range of {} blocks ({} bytes).",
+ range_start, range_start * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size(),
+ static_cast<uint64_t>(*range_offset) + range_start,
+ (static_cast<uint64_t>(*range_offset) + range_start) *
+ m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size())
+ << dendl;
+ } else {
+ dout(0)
+ << fmt::format(
+ "Data (Seed) Mismatch detected from block {}"
+ " ({} bytes) and spanning a range of {} blocks ({} bytes). "
+ "Data seed mismatch spanning a range of {} blocks ({} "
+ "bytes).",
+ range_start, range_start * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size())
+ << dendl;
}
range_length = 1;
range_start = i;
- if (seed_found_offsets.size() > 0)
- {
+ if (seed_found_offsets.size() > 0) {
range_offset = seed_found_offsets.front() - i;
- }
- else
- {
+ } else {
range_offset = std::nullopt;
}
}
}
- if (range_length == 1)
- {
- printDebugInformationForBlock(read_offset,
- start_block_offset + range_length_in_blocks - 1,
- bufferlist);
- }
- else if (range_length > 1 && range_offset.has_value())
- {
- dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) "
- "and spanning a range of {} blocks ({} bytes). "
- "Returned data located starting from block {} ({} bytes) "
- "and spanning a range of {} blocks ({} bytes).",
- range_start,
- range_start * m_model.get_block_size(),
- range_length,
- range_length * m_model.get_block_size(),
- *range_offset + range_start,
- (*range_offset + range_start) * m_model.get_block_size(),
- range_length,
- range_length * m_model.get_block_size())
+ if (range_length == 1) {
+ printDebugInformationForBlock(
+ read_offset, start_block_offset + range_length_in_blocks - 1,
+ bufferlist);
+ } else if (range_length > 1 && range_offset.has_value()) {
+ dout(0) << fmt::format(
+ "Data (Seed) Mismatch detected from block {} ({} bytes) "
+ "and spanning a range of {} blocks ({} bytes). "
+ "Returned data located starting from block {} ({} bytes) "
+ "and spanning a range of {} blocks ({} bytes).",
+ range_start, range_start * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size(),
+ *range_offset + range_start,
+ (*range_offset + range_start) * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size())
<< dendl;
- }
- else
- {
- dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) "
- "and spanning a range of {} blocks ({} bytes). "
- "and spanning a range of {} blocks ({} bytes).",
- range_start,
- range_start * m_model.get_block_size(),
- range_length,
- range_length * m_model.get_block_size(),
- range_length,
- range_length * m_model.get_block_size())
+ } else {
+ dout(0) << fmt::format(
+ "Data (Seed) Mismatch detected from block {} ({} bytes) "
+ "and spanning a range of {} blocks ({} bytes). "
+                   "Data seed mismatch spanning a range of {} blocks ({} bytes).",
+ range_start, range_start * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size(),
+ range_length, range_length * m_model.get_block_size())
<< dendl;
}
}
-void HeaderedSeededRandomGenerator
-::printDebugInformationDataBodyMismatchRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist)
-{
- dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. "
- "Headers look as expected for range, "
- "but generated data body does not match. "
- "More information given for individual blocks below.",
- start_block_offset,
- start_block_offset + range_length_in_blocks - 1)
+void HeaderedSeededRandomGenerator ::printDebugInformationDataBodyMismatchRange(
+ uint64_t read_offset, uint64_t start_block_offset,
+ uint64_t range_length_in_blocks, const bufferlist& bufferlist) {
+ dout(0) << fmt::format(
+ "Data Mismatch detected in blocks from {} to {}. "
+ "Headers look as expected for range, "
+ "but generated data body does not match. "
+ "More information given for individual blocks below.",
+ start_block_offset,
+ start_block_offset + range_length_in_blocks - 1)
<< dendl;
for (uint64_t i = start_block_offset;
- i < start_block_offset + range_length_in_blocks; i++)
- {
+ i < start_block_offset + range_length_in_blocks; i++) {
printDebugInformationForBlock(read_offset, i, bufferlist);
}
}
-void HeaderedSeededRandomGenerator
- ::printDebugInformationCorruptRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist)
-{
- dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. "
- "Headers look as expected for range, "
- "but generated data body does not match. "
- "More information given for individual blocks below.",
- start_block_offset,
- start_block_offset + range_length_in_blocks - 1)
+void HeaderedSeededRandomGenerator ::printDebugInformationCorruptRange(
+ uint64_t read_offset, uint64_t start_block_offset,
+ uint64_t range_length_in_blocks, const bufferlist& bufferlist) {
+ dout(0) << fmt::format(
+ "Data Mismatch detected in blocks from {} to {}. "
+ "Headers look as expected for range, "
+ "but generated data body does not match. "
+ "More information given for individual blocks below.",
+ start_block_offset,
+ start_block_offset + range_length_in_blocks - 1)
<< dendl;
for (uint64_t i = start_block_offset;
- i < start_block_offset + range_length_in_blocks; i++)
- {
+ i < start_block_offset + range_length_in_blocks; i++) {
printDebugInformationForBlock(read_offset, i, bufferlist);
}
}
-void HeaderedSeededRandomGenerator
- ::printDebugInformationDataNotFoundRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist)
-{
- dout(0) << fmt::format("Data not found for blocks from {} to {}. "
- "More information given for individual blocks below.",
- start_block_offset,
- start_block_offset + range_length_in_blocks - 1)
+void HeaderedSeededRandomGenerator ::printDebugInformationDataNotFoundRange(
+ uint64_t read_offset, uint64_t start_block_offset,
+ uint64_t range_length_in_blocks, const bufferlist& bufferlist) {
+ dout(0) << fmt::format(
+ "Data not found for blocks from {} to {}. "
+ "More information given for individual blocks below.",
+ start_block_offset,
+ start_block_offset + range_length_in_blocks - 1)
<< dendl;
- for (uint64_t i = start_block_offset; i < start_block_offset + range_length_in_blocks; i++)
- {
+ for (uint64_t i = start_block_offset;
+ i < start_block_offset + range_length_in_blocks; i++) {
printDebugInformationForBlock(read_offset, i, bufferlist);
}
}
-void HeaderedSeededRandomGenerator
- ::printDebugInformationForOffsets(uint64_t read_offset,
- std::vector<uint64_t> offsets,
- const bufferlist& bufferlist)
-{
+void HeaderedSeededRandomGenerator ::printDebugInformationForOffsets(
+ uint64_t read_offset, std::vector<uint64_t> offsets,
+ const bufferlist& bufferlist) {
uint64_t range_start = 0;
uint64_t range_length = 0;
ErrorType rangeError = ErrorType::UNKNOWN;
- for (const uint64_t& block_offset : offsets)
- {
- ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset,
- bufferlist);
+ for (const uint64_t& block_offset : offsets) {
+ ErrorType blockError =
+ getErrorTypeForBlock(read_offset, block_offset, bufferlist);
- if (range_start == 0 && range_length == 0)
- {
+ if (range_start == 0 && range_length == 0) {
range_start = block_offset;
range_length = 1;
rangeError = blockError;
- }
- else if (blockError == rangeError &&
- range_start + range_length == block_offset)
-{
+ } else if (blockError == rangeError &&
+ range_start + range_length == block_offset) {
range_length++;
- }
- else
- {
- if (range_length == 1)
- {
+ } else {
+ if (range_length == 1) {
printDebugInformationForBlock(read_offset, range_start, bufferlist);
- }
- else if (range_length > 1)
- {
+ } else if (range_length > 1) {
printDebugInformationForRange(read_offset, range_start, range_length,
rangeError, bufferlist);
}
@@ -741,12 +644,9 @@ void HeaderedSeededRandomGenerator
}
}
- if (range_length == 1)
- {
+ if (range_length == 1) {
printDebugInformationForBlock(read_offset, range_start, bufferlist);
- }
- else if (range_length > 1)
- {
+ } else if (range_length > 1) {
printDebugInformationForRange(read_offset, range_start, range_length,
rangeError, bufferlist);
}
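
For reference, validate_block() above compares a block byte for byte while skipping the volatile time field of the header. A minimal standalone sketch of that two-range comparison follows; the offsets and block size are illustrative only, not the Ceph types:

// Standalone sketch (not Ceph code) of the comparison in validate_block():
// match the block byte for byte except for the header's time field, which
// differs between write time and verify time.
#include <cstddef>
#include <cstring>
#include <iostream>

int main() {
  constexpr std::size_t time_start = 12;  // header offsets as laid out in DataGenerator.h
  constexpr std::size_t time_end = 20;
  constexpr std::size_t block_size = 32;  // arbitrary for the example

  char expected[block_size];
  char actual[block_size];
  std::memset(expected, 'A', block_size);
  std::memcpy(actual, expected, block_size);
  std::memset(actual + time_start, 'T', time_end - time_start);  // newer timestamp

  // Two-range compare: [0, time_start) and [time_end, block_size).
  bool valid = std::strncmp(expected, actual, time_start) == 0 &&
               std::strncmp(expected + time_end, actual + time_end,
                            block_size - time_end) == 0;
  std::cout << (valid ? "valid (timestamp ignored)" : "mismatch") << '\n';
}
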
diff --git a/src/common/io_exerciser/DataGenerator.h b/src/common/io_exerciser/DataGenerator.h
index 1e5784a54cc..c497c78ed61 100644
--- a/src/common/io_exerciser/DataGenerator.h
+++ b/src/common/io_exerciser/DataGenerator.h
@@ -3,8 +3,8 @@
#include <memory>
#include <random>
-#include "include/buffer.h"
#include "ObjectModel.h"
+#include "include/buffer.h"
/* Overview
*
@@ -23,149 +23,139 @@
*
* class HeaderedSeededRandomGenerator
* Inherits from SeededDataGenerator. Generates entirely random patterns
- * based on the seed retrieved by the model, however also appends a
+ * based on the seed retrieved by the model, however also appends a
* header to the start of each block. This generator also provides
 * a range of verbose debug options to help diagnose a miscompare
* whenever it detects unexpected data.
*/
namespace ceph {
- namespace io_exerciser {
- namespace data_generation {
- enum class GenerationType {
- SeededRandom,
- HeaderedSeededRandom
- // CompressedGenerator
- // MixedGenerator
- };
-
- class DataGenerator {
- public:
- virtual ~DataGenerator() = default;
- static std::unique_ptr<DataGenerator>
- create_generator(GenerationType generatorType,
- const ObjectModel& model);
- virtual bufferlist generate_data(uint64_t length, uint64_t offset)=0;
- virtual bool validate(bufferlist& bufferlist, uint64_t offset,
- uint64_t length);
-
- // Used for testing debug outputs from data generation
- virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length);
-
- protected:
- const ObjectModel& m_model;
-
- DataGenerator(const ObjectModel& model) : m_model(model) {}
- };
-
- class SeededRandomGenerator : public DataGenerator
- {
- public:
- SeededRandomGenerator(const ObjectModel& model)
- : DataGenerator(model) {}
-
- virtual bufferptr generate_block(uint64_t offset);
- virtual bufferlist generate_data(uint64_t length, uint64_t offset);
- virtual bufferptr generate_wrong_block(uint64_t offset);
- virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length) override;
- };
-
- class HeaderedSeededRandomGenerator : public SeededRandomGenerator
- {
- public:
- HeaderedSeededRandomGenerator(const ObjectModel& model,
- std::optional<uint64_t> unique_run_id = std::nullopt);
-
- bufferptr generate_block(uint64_t offset) override;
- bufferptr generate_wrong_block(uint64_t offset) override;
- bool validate(bufferlist& bufferlist, uint64_t offset,
- uint64_t length) override;
-
- private:
- using UniqueIdBytes = uint64_t;
- using SeedBytes = int;
- using TimeBytes = uint64_t;
-
- enum class ErrorType {
- RUN_ID_MISMATCH,
- SEED_MISMATCH,
- DATA_MISMATCH,
- DATA_NOT_FOUND,
- UNKNOWN
- };
-
- constexpr uint8_t headerStart() const
- { return 0; };
- constexpr uint8_t uniqueIdStart() const
- { return headerStart(); };
- constexpr uint8_t uniqueIdLength() const
- { return sizeof(UniqueIdBytes); };
- constexpr uint8_t seedStart() const
- { return uniqueIdStart() + uniqueIdLength(); };
- constexpr uint8_t seedLength() const
- { return sizeof(SeedBytes); };
- constexpr uint8_t timeStart() const
- { return seedStart() + seedLength(); };
- constexpr uint8_t timeLength() const
- { return sizeof(TimeBytes); };
- constexpr uint8_t timeEnd() const
- { return timeStart() + timeLength(); };
- constexpr uint8_t headerLength() const
- { return uniqueIdLength() + seedLength() + timeLength(); };
- constexpr uint8_t bodyStart() const
- { return headerStart() + headerLength(); };
-
- const UniqueIdBytes readUniqueRunId(uint64_t block_offset,
- const bufferlist& bufferlist);
- const SeedBytes readSeed(uint64_t block_offset,
- const bufferlist& bufferlist);
- const TimeBytes readDateTime(uint64_t block_offset,
+namespace io_exerciser {
+namespace data_generation {
+enum class GenerationType {
+ SeededRandom,
+ HeaderedSeededRandom
+ // CompressedGenerator
+ // MixedGenerator
+};
+
+class DataGenerator {
+ public:
+ virtual ~DataGenerator() = default;
+ static std::unique_ptr<DataGenerator> create_generator(
+ GenerationType generatorType, const ObjectModel& model);
+ virtual bufferlist generate_data(uint64_t length, uint64_t offset) = 0;
+ virtual bool validate(bufferlist& bufferlist, uint64_t offset,
+ uint64_t length);
+
+ // Used for testing debug outputs from data generation
+ virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length);
+
+ protected:
+ const ObjectModel& m_model;
+
+ DataGenerator(const ObjectModel& model) : m_model(model) {}
+};
+
+class SeededRandomGenerator : public DataGenerator {
+ public:
+ SeededRandomGenerator(const ObjectModel& model) : DataGenerator(model) {}
+
+ virtual bufferptr generate_block(uint64_t offset);
+ bufferlist generate_data(uint64_t length, uint64_t offset) override;
+ virtual bufferptr generate_wrong_block(uint64_t offset);
+ bufferlist generate_wrong_data(uint64_t offset,
+ uint64_t length) override;
+};
+
+class HeaderedSeededRandomGenerator : public SeededRandomGenerator {
+ public:
+ HeaderedSeededRandomGenerator(
+ const ObjectModel& model,
+ std::optional<uint64_t> unique_run_id = std::nullopt);
+
+ bufferptr generate_block(uint64_t offset) override;
+ bufferptr generate_wrong_block(uint64_t offset) override;
+ bool validate(bufferlist& bufferlist, uint64_t offset,
+ uint64_t length) override;
+
+ private:
+ using UniqueIdBytes = uint64_t;
+ using SeedBytes = int;
+ using TimeBytes = uint64_t;
+
+ enum class ErrorType {
+ RUN_ID_MISMATCH,
+ SEED_MISMATCH,
+ DATA_MISMATCH,
+ DATA_NOT_FOUND,
+ UNKNOWN
+ };
+
+ constexpr uint8_t headerStart() const { return 0; };
+ constexpr uint8_t uniqueIdStart() const { return headerStart(); };
+ constexpr uint8_t uniqueIdLength() const { return sizeof(UniqueIdBytes); };
+ constexpr uint8_t seedStart() const {
+ return uniqueIdStart() + uniqueIdLength();
+ };
+ constexpr uint8_t seedLength() const { return sizeof(SeedBytes); };
+ constexpr uint8_t timeStart() const { return seedStart() + seedLength(); };
+ constexpr uint8_t timeLength() const { return sizeof(TimeBytes); };
+ constexpr uint8_t timeEnd() const { return timeStart() + timeLength(); };
+ constexpr uint8_t headerLength() const {
+ return uniqueIdLength() + seedLength() + timeLength();
+ };
+ constexpr uint8_t bodyStart() const {
+ return headerStart() + headerLength();
+ };
+
+ const UniqueIdBytes readUniqueRunId(uint64_t block_offset,
+ const bufferlist& bufferlist);
+ const SeedBytes readSeed(uint64_t block_offset, const bufferlist& bufferlist);
+ const TimeBytes readDateTime(uint64_t block_offset,
+ const bufferlist& bufferlist);
+
+ const UniqueIdBytes unique_run_id;
+
+ uint64_t generate_unique_run_id();
+
+ bool validate_block(uint64_t block_offset, const char* buffer_start);
+
+ const ErrorType getErrorTypeForBlock(uint64_t read_offset,
+ uint64_t block_offset,
const bufferlist& bufferlist);
- const UniqueIdBytes unique_run_id;
-
- uint64_t generate_unique_run_id();
-
- bool validate_block(uint64_t block_offset, const char* buffer_start);
-
- const ErrorType getErrorTypeForBlock(uint64_t read_offset,
- uint64_t block_offset,
- const bufferlist& bufferlist);
-
- void printDebugInformationForBlock(uint64_t read_offset,
- uint64_t block_offset,
- const bufferlist& bufferlist);
- void printDebugInformationForRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- ErrorType rangeError,
- const bufferlist& bufferlist);
-
- void printDebugInformationForRunIdMismatchRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist);
- void printDebugInformationForSeedMismatchRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist);
- void printDebugInformationDataBodyMismatchRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist);
-      void printDebugInformationDataNotFoundRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist);
- void printDebugInformationCorruptRange(uint64_t read_offset,
- uint64_t start_block_offset,
- uint64_t range_length_in_blocks,
- const bufferlist& bufferlist);
-
- void printDebugInformationForOffsets(uint64_t read_offset,
- std::vector<uint64_t> offsets,
- const bufferlist& bufferlist);
- };
- }
- }
-}
+ void printDebugInformationForBlock(uint64_t read_offset,
+ uint64_t block_offset,
+ const bufferlist& bufferlist);
+ void printDebugInformationForRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ ErrorType rangeError,
+ const bufferlist& bufferlist);
+
+ void printDebugInformationForRunIdMismatchRange(
+ uint64_t read_offset, uint64_t start_block_offset,
+ uint64_t range_length_in_blocks, const bufferlist& bufferlist);
+ void printDebugInformationForSeedMismatchRange(
+ uint64_t read_offset, uint64_t start_block_offset,
+ uint64_t range_length_in_blocks, const bufferlist& bufferlist);
+ void printDebugInformationDataBodyMismatchRange(
+ uint64_t read_offset, uint64_t start_block_offset,
+ uint64_t range_length_in_blocks, const bufferlist& bufferlist);
+  void printDebugInformationDataNotFoundRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist);
+ void printDebugInformationCorruptRange(uint64_t read_offset,
+ uint64_t start_block_offset,
+ uint64_t range_length_in_blocks,
+ const bufferlist& bufferlist);
+
+ void printDebugInformationForOffsets(uint64_t read_offset,
+ std::vector<uint64_t> offsets,
+ const bufferlist& bufferlist);
+};
+} // namespace data_generation
+} // namespace io_exerciser
+} // namespace ceph
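
The constexpr accessors above pin down the block header layout: [unique run id | seed | time | body]. A minimal standalone sketch of the same layout arithmetic and the memcpy stamping used by generate_block()/readSeed(); it assumes the type aliases from this header and a typical platform where int is 4 bytes, and the 64-byte block size is an arbitrary choice:

// Standalone sketch (not Ceph code) of the header layout declared above.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

using UniqueIdBytes = uint64_t;  // 8 bytes
using SeedBytes = int;           // 4 bytes on typical platforms
using TimeBytes = uint64_t;      // 8 bytes

constexpr std::size_t uniqueIdStart = 0;
constexpr std::size_t seedStart = uniqueIdStart + sizeof(UniqueIdBytes);
constexpr std::size_t timeStart = seedStart + sizeof(SeedBytes);
constexpr std::size_t bodyStart = timeStart + sizeof(TimeBytes);

int main() {
  constexpr std::size_t block_size = 64;  // illustrative only
  std::vector<char> block(block_size, '\0');

  UniqueIdBytes run_id = 0x1234;
  SeedBytes seed = 42;
  TimeBytes time_ms = 1700000000000ull;

  // Stamp the header fields the same way generate_block() does with memcpy.
  std::memcpy(block.data() + uniqueIdStart, &run_id, sizeof(run_id));
  std::memcpy(block.data() + seedStart, &seed, sizeof(seed));
  std::memcpy(block.data() + timeStart, &time_ms, sizeof(time_ms));

  // Read a field back, mirroring readSeed() on a block within a bufferlist.
  SeedBytes read_seed = 0;
  std::memcpy(&read_seed, block.data() + seedStart, sizeof(read_seed));
  std::cout << "body starts at byte " << bodyStart
            << ", recovered seed = " << read_seed << '\n';
}
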
diff --git a/src/common/io_exerciser/EcIoSequence.cc b/src/common/io_exerciser/EcIoSequence.cc
new file mode 100644
index 00000000000..611920c96e0
--- /dev/null
+++ b/src/common/io_exerciser/EcIoSequence.cc
@@ -0,0 +1,267 @@
+#include "EcIoSequence.h"
+
+#include <memory>
+
+using IoOp = ceph::io_exerciser::IoOp;
+using Sequence = ceph::io_exerciser::Sequence;
+using IoSequence = ceph::io_exerciser::IoSequence;
+using EcIoSequence = ceph::io_exerciser::EcIoSequence;
+using ReadInjectSequence = ceph::io_exerciser::ReadInjectSequence;
+
+bool EcIoSequence::is_supported(Sequence sequence) const { return true; }
+
+std::unique_ptr<IoSequence> EcIoSequence::generate_sequence(
+ Sequence sequence, std::pair<int, int> obj_size_range, int k, int m,
+ int seed) {
+ switch (sequence) {
+ case Sequence::SEQUENCE_SEQ0:
+ [[fallthrough]];
+ case Sequence::SEQUENCE_SEQ1:
+ [[fallthrough]];
+ case Sequence::SEQUENCE_SEQ2:
+ [[fallthrough]];
+ case Sequence::SEQUENCE_SEQ3:
+ [[fallthrough]];
+ case Sequence::SEQUENCE_SEQ4:
+ [[fallthrough]];
+ case Sequence::SEQUENCE_SEQ5:
+ [[fallthrough]];
+ case Sequence::SEQUENCE_SEQ6:
+ [[fallthrough]];
+ case Sequence::SEQUENCE_SEQ7:
+ [[fallthrough]];
+ case Sequence::SEQUENCE_SEQ8:
+ [[fallthrough]];
+ case Sequence::SEQUENCE_SEQ9:
+ return std::make_unique<ReadInjectSequence>(obj_size_range, seed,
+ sequence, k, m);
+ case Sequence::SEQUENCE_SEQ10:
+ return std::make_unique<Seq10>(obj_size_range, seed, k, m);
+ default:
+ ceph_abort_msg("Unrecognised sequence");
+ }
+}
+
+EcIoSequence::EcIoSequence(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed),
+ setup_inject(false),
+ clear_inject(false),
+ shard_to_inject(std::nullopt) {}
+
+void EcIoSequence::select_random_data_shard_to_inject_read_error(int k, int m) {
+ shard_to_inject = rng(k - 1);
+ setup_inject = true;
+}
+
+void EcIoSequence::select_random_data_shard_to_inject_write_error(int k,
+ int m) {
+  // Write errors do not support injecting into the primary OSD
+ shard_to_inject = rng(1, k - 1);
+ setup_inject = true;
+}
+
+void EcIoSequence::select_random_shard_to_inject_read_error(int k, int m) {
+ shard_to_inject = rng(k + m - 1);
+ setup_inject = true;
+}
+
+void EcIoSequence::select_random_shard_to_inject_write_error(int k, int m) {
+  // Write errors do not support injecting into the primary OSD
+ shard_to_inject = rng(1, k + m - 1);
+ setup_inject = true;
+}
+
+void EcIoSequence::generate_random_read_inject_type() {
+ inject_op_type = static_cast<InjectOpType>(
+ rng(static_cast<int>(InjectOpType::ReadEIO),
+ static_cast<int>(InjectOpType::ReadMissingShard)));
+}
+
+void EcIoSequence::generate_random_write_inject_type() {
+ inject_op_type = static_cast<InjectOpType>(
+ rng(static_cast<int>(InjectOpType::WriteFailAndRollback),
+ static_cast<int>(InjectOpType::WriteOSDAbort)));
+}
+
+ceph::io_exerciser::ReadInjectSequence::ReadInjectSequence(
+ std::pair<int, int> obj_size_range, int seed, Sequence s, int k, int m)
+ : EcIoSequence(obj_size_range, seed) {
+ child_sequence = IoSequence::generate_sequence(s, obj_size_range, seed);
+ select_random_data_shard_to_inject_read_error(k, m);
+ generate_random_read_inject_type();
+}
+
+Sequence ceph::io_exerciser::ReadInjectSequence::get_id() const {
+ return child_sequence->get_id();
+}
+
+std::string ceph::io_exerciser::ReadInjectSequence::get_name() const {
+ return child_sequence->get_name() +
+ " running with read errors injected on shard " +
+ std::to_string(*shard_to_inject);
+}
+
+std::unique_ptr<IoOp> ReadInjectSequence::next() {
+ step++;
+
+ if (nextOp) {
+ std::unique_ptr<IoOp> retOp = nullptr;
+ nextOp.swap(retOp);
+ return retOp;
+ }
+
+ std::unique_ptr<IoOp> childOp = child_sequence->next();
+
+ switch (childOp->getOpType()) {
+ case OpType::Remove:
+ nextOp.swap(childOp);
+ switch (inject_op_type) {
+ case InjectOpType::ReadEIO:
+ return ClearReadErrorInjectOp::generate(*shard_to_inject, 0);
+ case InjectOpType::ReadMissingShard:
+ return ClearReadErrorInjectOp::generate(*shard_to_inject, 1);
+ case InjectOpType::WriteFailAndRollback:
+ return ClearWriteErrorInjectOp::generate(*shard_to_inject, 0);
+ case InjectOpType::WriteOSDAbort:
+ return ClearWriteErrorInjectOp::generate(*shard_to_inject, 3);
+ case InjectOpType::None:
+ [[fallthrough]];
+ default:
+ ceph_abort_msg("Unsupported operation");
+ }
+ break;
+ case OpType::Create:
+ switch (inject_op_type) {
+ case InjectOpType::ReadEIO:
+ nextOp = InjectReadErrorOp::generate(
+ *shard_to_inject, 0, 0, std::numeric_limits<uint64_t>::max());
+ break;
+ case InjectOpType::ReadMissingShard:
+ nextOp = InjectReadErrorOp::generate(
+ *shard_to_inject, 1, 0, std::numeric_limits<uint64_t>::max());
+ break;
+ case InjectOpType::WriteFailAndRollback:
+ nextOp = InjectWriteErrorOp::generate(
+ *shard_to_inject, 0, 0, std::numeric_limits<uint64_t>::max());
+ break;
+ case InjectOpType::WriteOSDAbort:
+ nextOp = InjectWriteErrorOp::generate(
+ *shard_to_inject, 3, 0, std::numeric_limits<uint64_t>::max());
+ break;
+ case InjectOpType::None:
+ [[fallthrough]];
+ default:
+ ceph_abort_msg("Unsupported operation");
+ }
+ break;
+ default:
+ // Do nothing in default case
+ break;
+ }
+
+ return childOp;
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp>
+ceph::io_exerciser::ReadInjectSequence::_next() {
+ ceph_abort_msg(
+ "Should not reach this point, "
+ "this sequence should only consume complete sequences");
+
+ return DoneOp::generate();
+}
+
+ceph::io_exerciser::Seq10::Seq10(std::pair<int, int> obj_size_range, int seed,
+ int k, int m)
+ : EcIoSequence(obj_size_range, seed),
+ offset(0),
+ length(1),
+ inject_error_done(false),
+ failed_write_done(false),
+ read_done(false),
+ successful_write_done(false),
+ test_all_lengths(false), // Only test length(1) due to time constraints
+ test_all_sizes(
+ false) // Only test obj_size(rand()) due to time constraints
+{
+ select_random_shard_to_inject_write_error(k, m);
+  // We will inject the error explicitly as part of this sequence
+ setup_inject = false;
+ if (!test_all_sizes) {
+ select_random_object_size();
+ }
+}
+
+Sequence ceph::io_exerciser::Seq10::get_id() const {
+ return Sequence::SEQUENCE_SEQ10;
+}
+
+std::string ceph::io_exerciser::Seq10::get_name() const {
+ return "Sequential writes of length " + std::to_string(length) +
+ " with queue depth 1"
+ " first injecting a failed write and read it to ensure it rolls back, "
+ "then"
+ " successfully writing the data and reading the write the ensure it "
+ "is applied";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq10::_next() {
+ if (!inject_error_done) {
+ inject_error_done = true;
+ return InjectWriteErrorOp::generate(*shard_to_inject, 0, 0,
+ std::numeric_limits<uint64_t>::max());
+ } else if (!failed_write_done) {
+ failed_write_done = true;
+ read_done = false;
+ barrier = true;
+ return SingleFailedWriteOp::generate(offset, length);
+ } else if (failed_write_done && !read_done) {
+ read_done = true;
+ barrier = true;
+ return SingleReadOp::generate(offset, length);
+ } else if (!clear_inject_done) {
+ clear_inject_done = true;
+ return ClearWriteErrorInjectOp::generate(*shard_to_inject, 0);
+ } else if (!successful_write_done) {
+ successful_write_done = true;
+ read_done = false;
+ barrier = true;
+ return SingleWriteOp::generate(offset, length);
+ } else if (successful_write_done && !read_done) {
+ read_done = true;
+ return SingleReadOp::generate(offset, length);
+ } else if (successful_write_done && read_done) {
+ offset++;
+ inject_error_done = false;
+ failed_write_done = false;
+ read_done = false;
+ clear_inject_done = false;
+ successful_write_done = false;
+
+ if (offset + length >= obj_size) {
+ if (!test_all_lengths) {
+ remove = true;
+ done = true;
+ return BarrierOp::generate();
+ }
+
+ offset = 0;
+ length++;
+ if (length > obj_size) {
+ if (!test_all_sizes) {
+ remove = true;
+ done = true;
+ return BarrierOp::generate();
+ }
+
+ length = 1;
+ return increment_object_size();
+ }
+ }
+
+ return BarrierOp::generate();
+ } else {
+ ceph_abort_msg("Sequence in undefined state. Aborting");
+ return DoneOp::generate();
+ }
+}
\ No newline at end of file
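
Seq10::_next() above is a small state machine. A standalone sketch of the per-offset op cycle it emits when test_all_lengths and test_all_sizes are false; the step names are descriptive labels only, not the real IoOp classes, and obj_size is an arbitrary example value:

// Illustrative sketch (not Ceph code) of the cycle Seq10 walks per offset.
#include <array>
#include <cstdint>
#include <iostream>
#include <string_view>

int main() {
  constexpr std::array<std::string_view, 6> cycle = {
      "inject write error on the chosen shard",
      "attempt a write (expected to fail and roll back)",
      "read back (verify the rollback)",
      "clear the write-error injection",
      "write again (expected to succeed)",
      "read back (verify the write applied)",
  };

  constexpr uint64_t obj_size = 3, length = 1;
  // Approximates the termination check: offset advances until
  // offset + length >= obj_size, then the object is removed and we are done.
  for (uint64_t offset = 0; offset + length < obj_size; ++offset) {
    std::cout << "offset " << offset << ":\n";
    for (auto step : cycle) std::cout << "  " << step << '\n';
  }
}
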
diff --git a/src/common/io_exerciser/EcIoSequence.h b/src/common/io_exerciser/EcIoSequence.h
new file mode 100644
index 00000000000..37283b3906b
--- /dev/null
+++ b/src/common/io_exerciser/EcIoSequence.h
@@ -0,0 +1,65 @@
+#include "IoSequence.h"
+
+namespace ceph {
+namespace io_exerciser {
+class EcIoSequence : public IoSequence {
+ public:
+ virtual bool is_supported(Sequence sequence) const override;
+ static std::unique_ptr<IoSequence> generate_sequence(
+ Sequence s, std::pair<int, int> obj_size_range, int k, int m, int seed);
+
+ protected:
+ bool setup_inject;
+ bool clear_inject;
+ std::optional<uint64_t> shard_to_inject;
+ InjectOpType inject_op_type;
+
+ EcIoSequence(std::pair<int, int> obj_size_range, int seed);
+
+  // Write errors cannot be injected on shard zero, so the read and write
+  // selections are separated out
+ void select_random_data_shard_to_inject_read_error(int k, int m);
+ void select_random_data_shard_to_inject_write_error(int k, int m);
+ void select_random_shard_to_inject_read_error(int k, int m);
+ void select_random_shard_to_inject_write_error(int k, int m);
+ void generate_random_read_inject_type();
+ void generate_random_write_inject_type();
+};
+
+class ReadInjectSequence : public EcIoSequence {
+ public:
+ ReadInjectSequence(std::pair<int, int> obj_size_range, int seed, Sequence s,
+ int k, int m);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ virtual std::unique_ptr<IoOp> next() override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ std::unique_ptr<IoSequence> child_sequence;
+ std::unique_ptr<IoOp> nextOp;
+};
+
+class Seq10 : public EcIoSequence {
+ public:
+ Seq10(std::pair<int, int> obj_size_range, int seed, int k, int m);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ uint64_t length;
+
+ bool inject_error_done;
+ bool failed_write_done;
+ bool read_done;
+ bool clear_inject_done;
+ bool successful_write_done;
+ bool test_all_lengths;
+ bool test_all_sizes;
+};
+} // namespace io_exerciser
+}  // namespace ceph
\ No newline at end of file
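
The four select_random_* helpers declared above differ only in their rng bounds. A sketch of those ranges with standard-library randomness, assuming the sequence's rng(n) draws uniformly from [0, n] and rng(a, b) from [a, b], both inclusive; k data shards, m coding shards, write-error injection skips shard 0 (the primary), and the k and m values are illustrative:

// Sketch (not Ceph code) of the shard-selection ranges used by EcIoSequence.
#include <iostream>
#include <random>

int main() {
  const int k = 4, m = 2;
  std::mt19937 gen(12345);  // fixed seed so the example is reproducible
  auto pick = [&](int lo, int hi) {
    return std::uniform_int_distribution<int>(lo, hi)(gen);
  };

  std::cout << "read error, data shard only:  " << pick(0, k - 1) << '\n';
  std::cout << "write error, data shard only: " << pick(1, k - 1) << '\n';
  std::cout << "read error, any shard:        " << pick(0, k + m - 1) << '\n';
  std::cout << "write error, any shard:       " << pick(1, k + m - 1) << '\n';
}
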
diff --git a/src/common/io_exerciser/IoOp.cc b/src/common/io_exerciser/IoOp.cc
index cd855ba6fff..493d1f435b4 100644
--- a/src/common/io_exerciser/IoOp.cc
+++ b/src/common/io_exerciser/IoOp.cc
@@ -1,188 +1,316 @@
#include "IoOp.h"
-using IoOp = ceph::io_exerciser::IoOp;
+#include "fmt/format.h"
+#include "include/ceph_assert.h"
-IoOp::IoOp( OpType op,
- uint64_t offset1, uint64_t length1,
- uint64_t offset2, uint64_t length2,
- uint64_t offset3, uint64_t length3) :
- op(op),
- offset1(offset1), length1(length1),
- offset2(offset2), length2(length2),
- offset3(offset3), length3(length3)
-{
+using IoOp = ceph::io_exerciser::IoOp;
+using OpType = ceph::io_exerciser::OpType;
-}
+using DoneOp = ceph::io_exerciser::DoneOp;
+using BarrierOp = ceph::io_exerciser::BarrierOp;
+using CreateOp = ceph::io_exerciser::CreateOp;
+using RemoveOp = ceph::io_exerciser::RemoveOp;
+using SingleReadOp = ceph::io_exerciser::SingleReadOp;
+using DoubleReadOp = ceph::io_exerciser::DoubleReadOp;
+using TripleReadOp = ceph::io_exerciser::TripleReadOp;
+using SingleWriteOp = ceph::io_exerciser::SingleWriteOp;
+using DoubleWriteOp = ceph::io_exerciser::DoubleWriteOp;
+using TripleWriteOp = ceph::io_exerciser::TripleWriteOp;
+using SingleFailedWriteOp = ceph::io_exerciser::SingleFailedWriteOp;
+using DoubleFailedWriteOp = ceph::io_exerciser::DoubleFailedWriteOp;
+using TripleFailedWriteOp = ceph::io_exerciser::TripleFailedWriteOp;
-std::string IoOp::value_to_string(uint64_t v) const
-{
+namespace {
+std::string value_to_string(uint64_t v) {
if (v < 1024 || (v % 1024) != 0) {
return std::to_string(v);
- }else if (v < 1024*1024 || (v % (1024 * 1024)) != 0 ) {
+ } else if (v < 1024 * 1024 || (v % (1024 * 1024)) != 0) {
return std::to_string(v / 1024) + "K";
- }else{
+ } else {
return std::to_string(v / 1024 / 1024) + "M";
}
}
+} // namespace
-std::unique_ptr<IoOp> IoOp
- ::generate_done() {
+IoOp::IoOp() {}
- return std::make_unique<IoOp>(OpType::Done);
-}
+template <OpType opType>
+ceph::io_exerciser::TestOp<opType>::TestOp() : IoOp() {}
+
+DoneOp::DoneOp() : TestOp<OpType::Done>() {}
-std::unique_ptr<IoOp> IoOp
- ::generate_barrier() {
+std::string DoneOp::to_string(uint64_t block_size) const { return "Done"; }
- return std::make_unique<IoOp>(OpType::BARRIER);
+std::unique_ptr<DoneOp> DoneOp::generate() {
+ return std::make_unique<DoneOp>();
}
-std::unique_ptr<IoOp> IoOp
- ::generate_create(uint64_t size) {
+BarrierOp::BarrierOp() : TestOp<OpType::Barrier>() {}
- return std::make_unique<IoOp>(OpType::CREATE,0,size);
+std::unique_ptr<BarrierOp> BarrierOp::generate() {
+ return std::make_unique<BarrierOp>();
}
-std::unique_ptr<IoOp> IoOp
- ::generate_remove() {
-
- return std::make_unique<IoOp>(OpType::REMOVE);
+std::string BarrierOp::to_string(uint64_t block_size) const {
+ return "Barrier";
}
-std::unique_ptr<IoOp> IoOp
- ::generate_read(uint64_t offset, uint64_t length) {
+CreateOp::CreateOp(uint64_t size) : TestOp<OpType::Create>(), size(size) {}
- return std::make_unique<IoOp>(OpType::READ, offset, length);
+std::unique_ptr<CreateOp> CreateOp::generate(uint64_t size) {
+ return std::make_unique<CreateOp>(size);
}
-std::unique_ptr<IoOp> IoOp
- ::generate_read2(uint64_t offset1, uint64_t length1,
- uint64_t offset2, uint64_t length2) {
+std::string CreateOp::to_string(uint64_t block_size) const {
+ return "Create (size=" + value_to_string(size * block_size) + ")";
+}
- if (offset1 < offset2) {
- ceph_assert( offset1 + length1 <= offset2 );
- } else {
- ceph_assert( offset2 + length2 <= offset1 );
- }
+RemoveOp::RemoveOp() : TestOp<OpType::Remove>() {}
- return std::make_unique<IoOp>(OpType::READ2,
- offset1, length1,
- offset2, length2);
+std::unique_ptr<RemoveOp> RemoveOp::generate() {
+ return std::make_unique<RemoveOp>();
}
-std::unique_ptr<IoOp> IoOp
- ::generate_read3(uint64_t offset1, uint64_t length1,
- uint64_t offset2, uint64_t length2,
- uint64_t offset3, uint64_t length3) {
+std::string RemoveOp::to_string(uint64_t block_size) const { return "Remove"; }
- if (offset1 < offset2) {
- ceph_assert( offset1 + length1 <= offset2 );
- } else {
- ceph_assert( offset2 + length2 <= offset1 );
+template <OpType opType, int numIOs>
+ceph::io_exerciser::ReadWriteOp<opType, numIOs>::ReadWriteOp(
+ std::array<uint64_t, numIOs>&& offset,
+ std::array<uint64_t, numIOs>&& length)
+ : TestOp<opType>(), offset(offset), length(length) {
+ auto compare = [](uint64_t offset1, uint64_t length1, uint64_t offset2,
+ uint64_t length2) {
+ if (offset1 < offset2) {
+ ceph_assert(offset1 + length1 <= offset2);
+ } else {
+ ceph_assert(offset2 + length2 <= offset1);
+ }
+ };
+
+ if (numIOs > 1) {
+ for (int i = 0; i < numIOs - 1; i++) {
+ for (int j = i + 1; j < numIOs; j++) {
+ compare(offset[i], length[i], offset[j], length[j]);
+ }
+ }
}
- if (offset1 < offset3) {
- ceph_assert( offset1 + length1 <= offset3 );
- } else {
- ceph_assert( offset3 + length3 <= offset1 );
+}
+
+template <OpType opType, int numIOs>
+std::string ceph::io_exerciser::ReadWriteOp<opType, numIOs>::to_string(
+ uint64_t block_size) const {
+ std::string offset_length_desc;
+ if (numIOs > 0) {
+ offset_length_desc += fmt::format(
+ "offset1={}", value_to_string(this->offset[0] * block_size));
+ offset_length_desc += fmt::format(
+ ",length1={}", value_to_string(this->length[0] * block_size));
+ for (int i = 1; i < numIOs; i++) {
+ offset_length_desc += fmt::format(
+ ",offset{}={}", i + 1, value_to_string(this->offset[i] * block_size));
+ offset_length_desc += fmt::format(
+ ",length{}={}", i + 1, value_to_string(this->length[i] * block_size));
+ }
}
- if (offset2 < offset3) {
- ceph_assert( offset2 + length2 <= offset3 );
- } else {
- ceph_assert( offset3 + length3 <= offset2 );
+ switch (opType) {
+ case OpType::Read:
+ [[fallthrough]];
+ case OpType::Read2:
+ [[fallthrough]];
+ case OpType::Read3:
+ return fmt::format("Read{} ({})", numIOs, offset_length_desc);
+ case OpType::Write:
+ [[fallthrough]];
+ case OpType::Write2:
+ [[fallthrough]];
+ case OpType::Write3:
+ return fmt::format("Write{} ({})", numIOs, offset_length_desc);
+ case OpType::FailedWrite:
+ [[fallthrough]];
+ case OpType::FailedWrite2:
+ [[fallthrough]];
+ case OpType::FailedWrite3:
+ return fmt::format("FailedWrite{} ({})", numIOs, offset_length_desc);
+ default:
+ ceph_abort_msg(
+ fmt::format("Unsupported op type by ReadWriteOp ({})", opType));
}
- return std::make_unique<IoOp>(OpType::READ3,
- offset1, length1,
- offset2, length2,
- offset3, length3);
}
-std::unique_ptr<IoOp> IoOp::generate_write(uint64_t offset, uint64_t length) {
- return std::make_unique<IoOp>(OpType::WRITE, offset, length);
+SingleReadOp::SingleReadOp(uint64_t offset, uint64_t length)
+ : ReadWriteOp<OpType::Read, 1>({offset}, {length}) {}
+
+std::unique_ptr<SingleReadOp> SingleReadOp::generate(uint64_t offset,
+ uint64_t length) {
+ return std::make_unique<SingleReadOp>(offset, length);
}
-std::unique_ptr<IoOp> IoOp::generate_write2(uint64_t offset1, uint64_t length1,
- uint64_t offset2, uint64_t length2) {
- if (offset1 < offset2) {
- ceph_assert( offset1 + length1 <= offset2 );
- } else {
- ceph_assert( offset2 + length2 <= offset1 );
- }
- return std::make_unique<IoOp>(OpType::WRITE2,
- offset1, length1,
- offset2, length2);
+DoubleReadOp::DoubleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2,
+ uint64_t length2)
+ : ReadWriteOp<OpType::Read2, 2>({offset1, offset2}, {length1, length2}) {}
+
+std::unique_ptr<DoubleReadOp> DoubleReadOp::generate(uint64_t offset1,
+ uint64_t length1,
+ uint64_t offset2,
+ uint64_t length2) {
+ return std::make_unique<DoubleReadOp>(offset1, length1, offset2, length2);
}
-std::unique_ptr<IoOp> IoOp::generate_write3(uint64_t offset1, uint64_t length1,
- uint64_t offset2, uint64_t length2,
- uint64_t offset3, uint64_t length3) {
- if (offset1 < offset2) {
- ceph_assert( offset1 + length1 <= offset2 );
- } else {
- ceph_assert( offset2 + length2 <= offset1 );
- }
- if (offset1 < offset3) {
- ceph_assert( offset1 + length1 <= offset3 );
- } else {
- ceph_assert( offset3 + length3 <= offset1 );
- }
- if (offset2 < offset3) {
- ceph_assert( offset2 + length2 <= offset3 );
- } else {
- ceph_assert( offset3 + length3 <= offset2 );
- }
- return std::make_unique<IoOp>(OpType::WRITE3,
- offset1, length1,
- offset2, length2,
- offset3, length3);
-}
-
-bool IoOp::done() {
- return (op == OpType::Done);
-}
-
-std::string IoOp::to_string(uint64_t block_size) const
-{
- switch (op) {
- case OpType::Done:
- return "Done";
- case OpType::BARRIER:
- return "Barrier";
- case OpType::CREATE:
- return "Create (size=" + value_to_string(length1 * block_size) + ")";
- case OpType::REMOVE:
- return "Remove";
- case OpType::READ:
- return "Read (offset=" + value_to_string(offset1 * block_size) +
- ",length=" + value_to_string(length1 * block_size) + ")";
- case OpType::READ2:
- return "Read2 (offset1=" + value_to_string(offset1 * block_size) +
- ",length1=" + value_to_string(length1 * block_size) +
- ",offset2=" + value_to_string(offset2 * block_size) +
- ",length2=" + value_to_string(length2 * block_size) + ")";
- case OpType::READ3:
- return "Read3 (offset1=" + value_to_string(offset1 * block_size) +
- ",length1=" + value_to_string(length1 * block_size) +
- ",offset2=" + value_to_string(offset2 * block_size) +
- ",length2=" + value_to_string(length2 * block_size) +
- ",offset3=" + value_to_string(offset3 * block_size) +
- ",length3=" + value_to_string(length3 * block_size) + ")";
- case OpType::WRITE:
- return "Write (offset=" + value_to_string(offset1 * block_size) +
- ",length=" + value_to_string(length1 * block_size) + ")";
- case OpType::WRITE2:
- return "Write2 (offset1=" + value_to_string(offset1 * block_size) +
- ",length1=" + value_to_string(length1 * block_size) +
- ",offset2=" + value_to_string(offset2 * block_size) +
- ",length2=" + value_to_string(length2 * block_size) + ")";
- case OpType::WRITE3:
- return "Write3 (offset1=" + value_to_string(offset1 * block_size) +
- ",length1=" + value_to_string(length1 * block_size) +
- ",offset2=" + value_to_string(offset2 * block_size) +
- ",length2=" + value_to_string(length2 * block_size) +
- ",offset3=" + value_to_string(offset3 * block_size) +
- ",length3=" + value_to_string(length3 * block_size) + ")";
- default:
- break;
- }
- return "Unknown";
+TripleReadOp::TripleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2,
+ uint64_t length2, uint64_t offset3, uint64_t length3)
+ : ReadWriteOp<OpType::Read3, 3>({offset1, offset2, offset3},
+ {length1, length2, length3}) {}
+
+std::unique_ptr<TripleReadOp> TripleReadOp::generate(
+ uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3) {
+ return std::make_unique<TripleReadOp>(offset1, length1, offset2, length2,
+ offset3, length3);
+}
+
+SingleWriteOp::SingleWriteOp(uint64_t offset, uint64_t length)
+ : ReadWriteOp<OpType::Write, 1>({offset}, {length}) {}
+
+std::unique_ptr<SingleWriteOp> SingleWriteOp::generate(uint64_t offset,
+ uint64_t length) {
+ return std::make_unique<SingleWriteOp>(offset, length);
+}
+
+DoubleWriteOp::DoubleWriteOp(uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2)
+ : ReadWriteOp<OpType::Write2, 2>({offset1, offset2}, {length1, length2}) {}
+
+std::unique_ptr<DoubleWriteOp> DoubleWriteOp::generate(uint64_t offset1,
+ uint64_t length1,
+ uint64_t offset2,
+ uint64_t length2) {
+ return std::make_unique<DoubleWriteOp>(offset1, length1, offset2, length2);
+}
+
+TripleWriteOp::TripleWriteOp(uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3)
+ : ReadWriteOp<OpType::Write3, 3>({offset1, offset2, offset3},
+ {length1, length2, length3}) {}
+
+std::unique_ptr<TripleWriteOp> TripleWriteOp::generate(
+ uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3) {
+ return std::make_unique<TripleWriteOp>(offset1, length1, offset2, length2,
+ offset3, length3);
+}
+
+SingleFailedWriteOp::SingleFailedWriteOp(uint64_t offset, uint64_t length)
+ : ReadWriteOp<OpType::FailedWrite, 1>({offset}, {length}) {}
+
+std::unique_ptr<SingleFailedWriteOp> SingleFailedWriteOp::generate(
+ uint64_t offset, uint64_t length) {
+ return std::make_unique<SingleFailedWriteOp>(offset, length);
+}
+
+DoubleFailedWriteOp::DoubleFailedWriteOp(uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2)
+ : ReadWriteOp<OpType::FailedWrite2, 2>({offset1, offset2},
+ {length1, length2}) {}
+
+std::unique_ptr<DoubleFailedWriteOp> DoubleFailedWriteOp::generate(
+ uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2) {
+ return std::make_unique<DoubleFailedWriteOp>(offset1, length1, offset2,
+ length2);
+}
+
+TripleFailedWriteOp::TripleFailedWriteOp(uint64_t offset1, uint64_t length1,
+ uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3)
+ : ReadWriteOp<OpType::FailedWrite3, 3>({offset1, offset2, offset3},
+ {length1, length2, length3}) {}
+
+std::unique_ptr<TripleFailedWriteOp> TripleFailedWriteOp::generate(
+ uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3) {
+ return std::make_unique<TripleFailedWriteOp>(offset1, length1, offset2,
+ length2, offset3, length3);
+}
+
+template <ceph::io_exerciser::OpType opType>
+ceph::io_exerciser::InjectErrorOp<opType>::InjectErrorOp(
+ int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration)
+ : TestOp<opType>(),
+ shard(shard),
+ type(type),
+ when(when),
+ duration(duration) {}
+
+template <ceph::io_exerciser::OpType opType>
+std::string ceph::io_exerciser::InjectErrorOp<opType>::to_string(
+ uint64_t block_size) const {
+ std::string_view inject_type = get_inject_type_string();
+ return fmt::format(
+ "Inject {} error on shard {} of type {}"
+ " after {} successful inject(s) lasting {} inject(s)",
+ inject_type, shard, type.value_or(0), when.value_or(0),
+ duration.value_or(1));
+}
+
+ceph::io_exerciser::InjectReadErrorOp::InjectReadErrorOp(
+ int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration)
+ : InjectErrorOp<OpType::InjectReadError>(shard, type, when, duration) {}
+
+std::unique_ptr<ceph::io_exerciser::InjectReadErrorOp>
+ceph::io_exerciser::InjectReadErrorOp::generate(
+ int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration) {
+ return std::make_unique<InjectReadErrorOp>(shard, type, when, duration);
+}
+
+ceph::io_exerciser::InjectWriteErrorOp::InjectWriteErrorOp(
+ int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration)
+ : InjectErrorOp<OpType::InjectWriteError>(shard, type, when, duration) {}
+
+std::unique_ptr<ceph::io_exerciser::InjectWriteErrorOp>
+ceph::io_exerciser::InjectWriteErrorOp::generate(
+ int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration) {
+ return std::make_unique<InjectWriteErrorOp>(shard, type, when, duration);
+}
+
+template <ceph::io_exerciser::OpType opType>
+ceph::io_exerciser::ClearErrorInjectOp<opType>::ClearErrorInjectOp(
+ int shard, const std::optional<uint64_t>& type)
+ : TestOp<opType>(), shard(shard), type(type) {}
+
+template <ceph::io_exerciser::OpType opType>
+std::string ceph::io_exerciser::ClearErrorInjectOp<opType>::to_string(
+ uint64_t block_size) const {
+ std::string_view inject_type = get_inject_type_string();
+ return fmt::format("Clear {} injects on shard {} of type {}", inject_type,
+ shard, type.value_or(0));
+}
+
+ceph::io_exerciser::ClearReadErrorInjectOp::ClearReadErrorInjectOp(
+ int shard, const std::optional<uint64_t>& type)
+ : ClearErrorInjectOp<OpType::ClearReadErrorInject>(shard, type) {}
+
+std::unique_ptr<ceph::io_exerciser::ClearReadErrorInjectOp>
+ceph::io_exerciser::ClearReadErrorInjectOp::generate(
+ int shard, const std::optional<uint64_t>& type) {
+ return std::make_unique<ClearReadErrorInjectOp>(shard, type);
+}
+
+ceph::io_exerciser::ClearWriteErrorInjectOp::ClearWriteErrorInjectOp(
+ int shard, const std::optional<uint64_t>& type)
+ : ClearErrorInjectOp<OpType::ClearWriteErrorInject>(shard, type) {}
+
+std::unique_ptr<ceph::io_exerciser::ClearWriteErrorInjectOp>
+ceph::io_exerciser::ClearWriteErrorInjectOp::generate(
+ int shard, const std::optional<uint64_t>& type) {
+ return std::make_unique<ClearWriteErrorInjectOp>(shard, type);
}
\ No newline at end of file
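
To make the refactor above concrete, here is a minimal usage sketch (not part of the patch), assuming a Ceph build tree that provides these headers and implementations. Ops are held through the IoOp base, since ReadWriteOp keeps its to_string() override protected; offsets and lengths are stored in blocks and scaled by the block size passed to to_string().

    #include <iostream>
    #include <memory>

    #include "common/io_exerciser/IoOp.h"

    using namespace ceph::io_exerciser;

    int main() {
      // Hold ops through the IoOp base: to_string()/getOpType() are public
      // there, while ReadWriteOp keeps its to_string() override protected.
      std::unique_ptr<IoOp> write = SingleWriteOp::generate(0, 4);
      std::unique_ptr<IoOp> read = TripleReadOp::generate(0, 1, 2, 1, 4, 1);

      constexpr uint64_t block_size = 4096;
      std::cout << write->to_string(block_size) << '\n';  // "Write1 (offset1=0,length1=...)"
      std::cout << read->to_string(block_size) << '\n';   // "Read3 (offset1=..., ...)"

      // getOpType() supports dispatch on the concrete op without RTTI.
      if (read->getOpType() == OpType::Read3) {
        std::cout << "triple read op" << '\n';
      }
      return 0;
    }
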
diff --git a/src/common/io_exerciser/IoOp.h b/src/common/io_exerciser/IoOp.h
index 60c02a93d4e..1887eafcc1f 100644
--- a/src/common/io_exerciser/IoOp.h
+++ b/src/common/io_exerciser/IoOp.h
@@ -1,94 +1,248 @@
#pragma once
-#include <string>
+#include <array>
#include <memory>
-#include "include/ceph_assert.h"
+#include <optional>
+#include <string>
+
+#include "OpType.h"
/* Overview
*
- * enum OpType
- * Enumeration of different types of I/O operation
- *
* class IoOp
* Stores details for an I/O operation. Generated by IoSequences
 * and applied by IoExercisers
*/
namespace ceph {
- namespace io_exerciser {
-
- enum class OpType {
- Done, // End of I/O sequence
- BARRIER, // Barrier - all prior I/Os must complete
- CREATE, // Create object and pattern with data
- REMOVE, // Remove object
- READ, // Read
- READ2, // 2 Reads in one op
- READ3, // 3 Reads in one op
- WRITE, // Write
- WRITE2, // 2 Writes in one op
- WRITE3 // 3 Writes in one op
- };
-
- class IoOp {
- protected:
- std::string value_to_string(uint64_t v) const;
-
- public:
- OpType op;
- uint64_t offset1;
- uint64_t length1;
- uint64_t offset2;
- uint64_t length2;
- uint64_t offset3;
- uint64_t length3;
-
- IoOp( OpType op,
- uint64_t offset1 = 0, uint64_t length1 = 0,
- uint64_t offset2 = 0, uint64_t length2 = 0,
- uint64_t offset3 = 0, uint64_t length3 = 0 );
-
- static std::unique_ptr<IoOp> generate_done();
-
- static std::unique_ptr<IoOp> generate_barrier();
-
- static std::unique_ptr<IoOp> generate_create(uint64_t size);
-
- static std::unique_ptr<IoOp> generate_remove();
-
- static std::unique_ptr<IoOp> generate_read(uint64_t offset,
+namespace io_exerciser {
+
+class IoOp {
+ public:
+ IoOp();
+ virtual ~IoOp() = default;
+ virtual std::string to_string(uint64_t block_size) const = 0;
+ virtual constexpr OpType getOpType() const = 0;
+};
+
+template <OpType opType>
+class TestOp : public IoOp {
+ public:
+ TestOp();
+ constexpr OpType getOpType() const override { return opType; }
+};
+
+class DoneOp : public TestOp<OpType::Done> {
+ public:
+ DoneOp();
+ static std::unique_ptr<DoneOp> generate();
+ std::string to_string(uint64_t block_size) const override;
+};
+
+class BarrierOp : public TestOp<OpType::Barrier> {
+ public:
+ BarrierOp();
+ static std::unique_ptr<BarrierOp> generate();
+ std::string to_string(uint64_t block_size) const override;
+};
+
+class CreateOp : public TestOp<OpType::Create> {
+ public:
+ CreateOp(uint64_t size);
+ static std::unique_ptr<CreateOp> generate(uint64_t size);
+ std::string to_string(uint64_t block_size) const override;
+ uint64_t size;
+};
+
+class RemoveOp : public TestOp<OpType::Remove> {
+ public:
+ RemoveOp();
+ static std::unique_ptr<RemoveOp> generate();
+ std::string to_string(uint64_t block_size) const override;
+};
+
+template <OpType opType, int numIOs>
+class ReadWriteOp : public TestOp<opType> {
+ public:
+ std::array<uint64_t, numIOs> offset;
+ std::array<uint64_t, numIOs> length;
+
+ protected:
+ ReadWriteOp(std::array<uint64_t, numIOs>&& offset,
+ std::array<uint64_t, numIOs>&& length);
+ std::string to_string(uint64_t block_size) const override;
+};
+
+class SingleReadOp : public ReadWriteOp<OpType::Read, 1> {
+ public:
+ SingleReadOp(uint64_t offset, uint64_t length);
+ static std::unique_ptr<SingleReadOp> generate(uint64_t offset,
+ uint64_t length);
+};
+
+class DoubleReadOp : public ReadWriteOp<OpType::Read2, 2> {
+ public:
+ DoubleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2,
+ uint64_t length2);
+ static std::unique_ptr<DoubleReadOp> generate(uint64_t offset1,
+ uint64_t length1,
+ uint64_t offset2,
+ uint64_t length2);
+};
+
+class TripleReadOp : public ReadWriteOp<OpType::Read3, 3> {
+ public:
+ TripleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2,
+ uint64_t length2, uint64_t offset3, uint64_t length3);
+ static std::unique_ptr<TripleReadOp> generate(
+ uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3);
+};
+
+class SingleWriteOp : public ReadWriteOp<OpType::Write, 1> {
+ public:
+ SingleWriteOp(uint64_t offset, uint64_t length);
+ static std::unique_ptr<SingleWriteOp> generate(uint64_t offset,
uint64_t length);
+};
+
+class DoubleWriteOp : public ReadWriteOp<OpType::Write2, 2> {
+ public:
+ DoubleWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2,
+ uint64_t length2);
+ static std::unique_ptr<DoubleWriteOp> generate(uint64_t offset1,
+ uint64_t length1,
+ uint64_t offset2,
+ uint64_t length2);
+};
+
+class TripleWriteOp : public ReadWriteOp<OpType::Write3, 3> {
+ public:
+ TripleWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2,
+ uint64_t length2, uint64_t offset3, uint64_t length3);
+ static std::unique_ptr<TripleWriteOp> generate(
+ uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3);
+};
+
+class SingleFailedWriteOp : public ReadWriteOp<OpType::FailedWrite, 1> {
+ public:
+ SingleFailedWriteOp(uint64_t offset, uint64_t length);
+ static std::unique_ptr<SingleFailedWriteOp> generate(uint64_t offset,
+ uint64_t length);
+};
+
+class DoubleFailedWriteOp : public ReadWriteOp<OpType::FailedWrite2, 2> {
+ public:
+ DoubleFailedWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2,
+ uint64_t length2);
+ static std::unique_ptr<DoubleFailedWriteOp> generate(uint64_t offset1,
+ uint64_t length1,
+ uint64_t offset2,
+ uint64_t length2);
+};
+
+class TripleFailedWriteOp : public ReadWriteOp<OpType::FailedWrite3, 3> {
+ public:
+ TripleFailedWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2,
+ uint64_t length2, uint64_t offset3, uint64_t length3);
+ static std::unique_ptr<TripleFailedWriteOp> generate(
+ uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2,
+ uint64_t offset3, uint64_t length3);
+};
+
+template <ceph::io_exerciser::OpType opType>
+class InjectErrorOp : public TestOp<opType> {
+ public:
+ InjectErrorOp(int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration);
+
+ std::string to_string(uint64_t block_size) const override;
+
+ int shard;
+ std::optional<uint64_t> type;
+ std::optional<uint64_t> when;
+ std::optional<uint64_t> duration;
+
+ protected:
+ virtual inline constexpr std::string_view get_inject_type_string() const = 0;
+};
+
+class InjectReadErrorOp : public InjectErrorOp<OpType::InjectReadError> {
+ public:
+ InjectReadErrorOp(int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration);
+
+ static std::unique_ptr<InjectReadErrorOp> generate(
+ int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration);
+
+ protected:
+ inline constexpr std::string_view get_inject_type_string() const override {
+ return "read";
+ }
+};
+
+class InjectWriteErrorOp : public InjectErrorOp<OpType::InjectWriteError> {
+ public:
+ InjectWriteErrorOp(int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration);
+
+ static std::unique_ptr<InjectWriteErrorOp> generate(
+ int shard, const std::optional<uint64_t>& type,
+ const std::optional<uint64_t>& when,
+ const std::optional<uint64_t>& duration);
+
+ protected:
+ inline constexpr std::string_view get_inject_type_string() const override {
+ return "write";
+ }
+};
+
+template <ceph::io_exerciser::OpType opType>
+class ClearErrorInjectOp : public TestOp<opType> {
+ public:
+ ClearErrorInjectOp(int shard, const std::optional<uint64_t>& type);
+
+ std::string to_string(uint64_t block_size) const override;
+
+ int shard;
+ std::optional<uint64_t> type;
+
+ protected:
+ virtual inline constexpr std::string_view get_inject_type_string() const = 0;
+};
+
+class ClearReadErrorInjectOp
+ : public ClearErrorInjectOp<OpType::ClearReadErrorInject> {
+ public:
+ ClearReadErrorInjectOp(int shard, const std::optional<uint64_t>& type);
+
+ static std::unique_ptr<ClearReadErrorInjectOp> generate(
+ int shard, const std::optional<uint64_t>& type);
+
+ protected:
+ inline constexpr std::string_view get_inject_type_string() const override {
+ return "read";
+ }
+};
+
+class ClearWriteErrorInjectOp
+ : public ClearErrorInjectOp<OpType::ClearWriteErrorInject> {
+ public:
+ ClearWriteErrorInjectOp(int shard, const std::optional<uint64_t>& type);
+
+ static std::unique_ptr<ClearWriteErrorInjectOp> generate(
+ int shard, const std::optional<uint64_t>& type);
- static std::unique_ptr<IoOp> generate_read2(uint64_t offset1,
- uint64_t length1,
- uint64_t offset2,
- uint64_t length2);
-
- static std::unique_ptr<IoOp> generate_read3(uint64_t offset1,
- uint64_t length1,
- uint64_t offset2,
- uint64_t length2,
- uint64_t offset3,
- uint64_t length3);
-
- static std::unique_ptr<IoOp> generate_write(uint64_t offset,
- uint64_t length);
-
- static std::unique_ptr<IoOp> generate_write2(uint64_t offset1,
- uint64_t length1,
- uint64_t offset2,
- uint64_t length2);
-
- static std::unique_ptr<IoOp> generate_write3(uint64_t offset1,
- uint64_t length1,
- uint64_t offset2,
- uint64_t length2,
- uint64_t offset3,
- uint64_t length3);
-
- bool done();
-
- std::string to_string(uint64_t block_size) const;
- };
+ protected:
+ inline constexpr std::string_view get_inject_type_string() const override {
+ return "write";
}
-}
\ No newline at end of file
+};
+} // namespace io_exerciser
+} // namespace ceph
\ No newline at end of file
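
The header above replaces the old offset1/offset2/offset3 fields with per-class offset/length arrays, so consumers recover the concrete type via getOpType() plus a static_cast, the same pattern ObjectModel::applyIoOp uses later in this patch. A hedged sketch; first_offset_or_zero is a hypothetical helper, not patch code.

    #include <iostream>

    #include "common/io_exerciser/IoOp.h"

    using namespace ceph::io_exerciser;

    // Hypothetical helper: recover the first offset from a type-erased op.
    uint64_t first_offset_or_zero(IoOp& op) {
      switch (op.getOpType()) {
        case OpType::Read:
          // Safe: OpType::Read is only ever produced by SingleReadOp.
          return static_cast<SingleReadOp&>(op).offset[0];
        case OpType::Write:
          return static_cast<SingleWriteOp&>(op).offset[0];
        default:
          return 0;  // Done, Barrier, Remove etc. carry no offset
      }
    }

    int main() {
      auto read = SingleReadOp::generate(3, 2);
      std::cout << first_offset_or_zero(*read) << '\n';  // prints 3
      return 0;
    }
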
diff --git a/src/common/io_exerciser/IoSequence.cc b/src/common/io_exerciser/IoSequence.cc
index 4a7ca0593d1..83f1cc595a5 100644
--- a/src/common/io_exerciser/IoSequence.cc
+++ b/src/common/io_exerciser/IoSequence.cc
@@ -1,12 +1,12 @@
#include "IoSequence.h"
+using IoOp = ceph::io_exerciser::IoOp;
using Sequence = ceph::io_exerciser::Sequence;
using IoSequence = ceph::io_exerciser::IoSequence;
-std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& seq)
-{
- switch (seq)
- {
+std::ostream& ceph::io_exerciser::operator<<(std::ostream& os,
+ const Sequence& seq) {
+ switch (seq) {
case Sequence::SEQUENCE_SEQ0:
os << "SEQUENCE_SEQ0";
break;
@@ -37,6 +37,9 @@ std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& s
case Sequence::SEQUENCE_SEQ9:
os << "SEQUENCE_SEQ9";
break;
+ case Sequence::SEQUENCE_SEQ10:
+ os << "SEQUENCE_SEQ10";
+ break;
case Sequence::SEQUENCE_END:
os << "SEQUENCE_END";
break;
@@ -44,19 +47,12 @@ std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& s
return os;
}
-IoSequence::IoSequence(std::pair<int,int> obj_size_range,
- int seed) :
- min_obj_size(obj_size_range.first), max_obj_size(obj_size_range.second),
- create(true), barrier(false), done(false), remove(false),
- obj_size(min_obj_size), step(-1), seed(seed)
-{
- rng.seed(seed);
+bool IoSequence::is_supported(Sequence sequence) const {
+ return sequence != Sequence::SEQUENCE_SEQ10;
}
-std::unique_ptr<IoSequence> IoSequence::generate_sequence(Sequence s,
- std::pair<int,int> obj_size_range,
- int seed)
-{
+std::unique_ptr<IoSequence> IoSequence::generate_sequence(
+ Sequence s, std::pair<int, int> obj_size_range, int seed) {
switch (s) {
case Sequence::SEQUENCE_SEQ0:
return std::make_unique<Seq0>(obj_size_range, seed);
@@ -78,24 +74,39 @@ std::unique_ptr<IoSequence> IoSequence::generate_sequence(Sequence s,
return std::make_unique<Seq8>(obj_size_range, seed);
case Sequence::SEQUENCE_SEQ9:
return std::make_unique<Seq9>(obj_size_range, seed);
+ case Sequence::SEQUENCE_SEQ10:
+ ceph_abort_msg(
+ "Sequence 10 only supported for erasure coded pools "
+ "through the EcIoSequence interface");
+ return nullptr;
default:
break;
}
return nullptr;
}
-int IoSequence::get_step() const
-{
- return step;
+IoSequence::IoSequence(std::pair<int, int> obj_size_range, int seed)
+ : min_obj_size(obj_size_range.first),
+ max_obj_size(obj_size_range.second),
+ create(true),
+ barrier(false),
+ done(false),
+ remove(false),
+ obj_size(min_obj_size),
+ step(-1),
+ seed(seed) {
+ rng.seed(seed);
}
-int IoSequence::get_seed() const
-{
- return seed;
+std::string ceph::io_exerciser::IoSequence::get_name_with_seqseed() const {
+ return get_name() + " (seqseed " + std::to_string(get_seed()) + ")";
}
-void IoSequence::set_min_object_size(uint64_t size)
-{
+int IoSequence::get_step() const { return step; }
+
+int IoSequence::get_seed() const { return seed; }
+
+void IoSequence::set_min_object_size(uint64_t size) {
min_obj_size = size;
if (obj_size < size) {
obj_size = size;
@@ -105,23 +116,20 @@ void IoSequence::set_min_object_size(uint64_t size)
}
}
-void IoSequence::set_max_object_size(uint64_t size)
-{
+void IoSequence::set_max_object_size(uint64_t size) {
max_obj_size = size;
if (obj_size > size) {
done = true;
}
}
-void IoSequence::select_random_object_size()
-{
+void IoSequence::select_random_object_size() {
if (max_obj_size != min_obj_size) {
obj_size = min_obj_size + rng(max_obj_size - min_obj_size);
}
}
-std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::increment_object_size()
-{
+std::unique_ptr<IoOp> IoSequence::increment_object_size() {
obj_size++;
if (obj_size > max_obj_size) {
done = true;
@@ -129,106 +137,118 @@ std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::increment_object_size()
create = true;
barrier = true;
remove = true;
- return IoOp::generate_barrier();
+ return BarrierOp::generate();
}
-std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::next()
-{
+Sequence IoSequence::getNextSupportedSequenceId() const {
+ Sequence sequence = get_id();
+ ++sequence;
+ for (; sequence < Sequence::SEQUENCE_END; ++sequence) {
+ if (is_supported(sequence)) {
+ return sequence;
+ }
+ }
+
+ return Sequence::SEQUENCE_END;
+}
+
+std::unique_ptr<IoOp> IoSequence::next() {
step++;
if (remove) {
remove = false;
- return IoOp::generate_remove();
+ return RemoveOp::generate();
}
if (barrier) {
barrier = false;
- return IoOp::generate_barrier();
+ return BarrierOp::generate();
}
if (done) {
- return IoOp::generate_done();
+ return DoneOp::generate();
}
if (create) {
create = false;
barrier = true;
- return IoOp::generate_create(obj_size);
+ return CreateOp::generate(obj_size);
}
return _next();
}
-
-
-ceph::io_exerciser::Seq0::Seq0(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed), offset(0)
-{
+ceph::io_exerciser::Seq0::Seq0(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed), offset(0) {
select_random_object_size();
length = 1 + rng(obj_size - 1);
}
-std::string ceph::io_exerciser::Seq0::get_name() const
-{
+Sequence ceph::io_exerciser::Seq0::get_id() const {
+ return Sequence::SEQUENCE_SEQ0;
+}
+
+std::string ceph::io_exerciser::Seq0::get_name() const {
return "Sequential reads of length " + std::to_string(length) +
- " with queue depth 1 (seqseed " + std::to_string(get_seed()) + ")";
+ " with queue depth 1";
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq0::_next()
-{
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq0::_next() {
std::unique_ptr<IoOp> r;
if (offset >= obj_size) {
done = true;
barrier = true;
remove = true;
- return IoOp::generate_barrier();
+ return BarrierOp::generate();
}
if (offset + length > obj_size) {
- r = IoOp::generate_read(offset, obj_size - offset);
+ r = SingleReadOp::generate(offset, obj_size - offset);
} else {
- r = IoOp::generate_read(offset, length);
+ r = SingleReadOp::generate(offset, length);
}
offset += length;
return r;
}
-
-
-ceph::io_exerciser::Seq1::Seq1(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed)
-{
+ceph::io_exerciser::Seq1::Seq1(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed) {
select_random_object_size();
count = 3 * obj_size;
}
-std::string ceph::io_exerciser::Seq1::get_name() const
-{
- return "Random offset, random length read/write I/O with queue depth 1 (seqseed "
- + std::to_string(get_seed()) + ")";
+Sequence ceph::io_exerciser::Seq1::get_id() const {
+ return Sequence::SEQUENCE_SEQ1;
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq1::_next()
-{
+std::string ceph::io_exerciser::Seq1::get_name() const {
+ return "Random offset, random length read/write I/O with queue depth 1";
+}
+
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq1::_next() {
barrier = true;
if (count-- == 0) {
done = true;
remove = true;
- return IoOp::generate_barrier();
+ return BarrierOp::generate();
}
uint64_t offset = rng(obj_size - 1);
uint64_t length = 1 + rng(obj_size - 1 - offset);
- return (rng(2) != 0) ? IoOp::generate_write(offset, length) :
- IoOp::generate_read(offset, length);
-}
+ if (rng(2) != 0) {
+ return SingleWriteOp::generate(offset, length);
+ } else {
+ return SingleReadOp::generate(offset, length);
+ }
+}
+ceph::io_exerciser::Seq2::Seq2(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed), offset(0), length(0) {}
-ceph::io_exerciser::Seq2::Seq2(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed), offset(0), length(0) {}
+Sequence ceph::io_exerciser::Seq2::get_id() const {
+ return Sequence::SEQUENCE_SEQ2;
+}
-std::string ceph::io_exerciser::Seq2::get_name() const
-{
+std::string ceph::io_exerciser::Seq2::get_name() const {
return "Permutations of offset and length read I/O";
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next()
-{
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next() {
length++;
if (length > obj_size - offset) {
length = 1;
@@ -239,24 +259,23 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next()
return increment_object_size();
}
}
- return IoOp::generate_read(offset, length);
+ return SingleReadOp::generate(offset, length);
}
-
-
-ceph::io_exerciser::Seq3::Seq3(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed), offset1(0), offset2(0)
-{
+ceph::io_exerciser::Seq3::Seq3(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed), offset1(0), offset2(0) {
set_min_object_size(2);
}
-std::string ceph::io_exerciser::Seq3::get_name() const
-{
+Sequence ceph::io_exerciser::Seq3::get_id() const {
+ return Sequence::SEQUENCE_SEQ3;
+}
+
+std::string ceph::io_exerciser::Seq3::get_name() const {
return "Permutations of offset 2-region 1-block read I/O";
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next()
-{
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next() {
offset2++;
if (offset2 >= obj_size - offset1) {
offset2 = 1;
@@ -267,24 +286,23 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next()
return increment_object_size();
}
}
- return IoOp::generate_read2(offset1, 1, offset1 + offset2, 1);
+ return DoubleReadOp::generate(offset1, 1, offset1 + offset2, 1);
}
-
-
-ceph::io_exerciser::Seq4::Seq4(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed), offset1(0), offset2(1)
-{
+ceph::io_exerciser::Seq4::Seq4(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed), offset1(0), offset2(1) {
set_min_object_size(3);
}
-std::string ceph::io_exerciser::Seq4::get_name() const
-{
+Sequence ceph::io_exerciser::Seq4::get_id() const {
+ return Sequence::SEQUENCE_SEQ4;
+}
+
+std::string ceph::io_exerciser::Seq4::get_name() const {
return "Permutations of offset 3-region 1-block read I/O";
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next()
-{
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next() {
offset2++;
if (offset2 >= obj_size - offset1) {
offset2 = 2;
@@ -295,33 +313,35 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next()
return increment_object_size();
}
}
- return IoOp::generate_read3(offset1, 1,
- offset1 + offset2, 1,
- (offset1 * 2 + offset2)/2, 1);
+ return TripleReadOp::generate(offset1, 1, (offset1 + offset2), 1,
+ (offset1 * 2 + offset2) / 2, 1);
}
+ceph::io_exerciser::Seq5::Seq5(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed),
+ offset(0),
+ length(1),
+ doneread(false),
+ donebarrier(false) {}
+Sequence ceph::io_exerciser::Seq5::get_id() const {
+ return Sequence::SEQUENCE_SEQ5;
+}
-ceph::io_exerciser::Seq5::Seq5(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed), offset(0), length(1),
- doneread(false), donebarrier(false) {}
-
-std::string ceph::io_exerciser::Seq5::get_name() const
-{
+std::string ceph::io_exerciser::Seq5::get_name() const {
return "Permutation of length sequential writes";
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next()
-{
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next() {
if (offset >= obj_size) {
if (!doneread) {
if (!donebarrier) {
donebarrier = true;
- return IoOp::generate_barrier();
+ return BarrierOp::generate();
}
doneread = true;
barrier = true;
- return IoOp::generate_read(0, obj_size);
+ return SingleReadOp::generate(0, obj_size);
}
doneread = false;
donebarrier = false;
@@ -333,33 +353,36 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next()
}
}
uint64_t io_len = (offset + length > obj_size) ? (obj_size - offset) : length;
- std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len);
+ std::unique_ptr<IoOp> r = SingleWriteOp::generate(offset, io_len);
offset += io_len;
return r;
}
+ceph::io_exerciser::Seq6::Seq6(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed),
+ offset(0),
+ length(1),
+ doneread(false),
+ donebarrier(false) {}
+Sequence ceph::io_exerciser::Seq6::get_id() const {
+ return Sequence::SEQUENCE_SEQ6;
+}
-ceph::io_exerciser::Seq6::Seq6(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed), offset(0), length(1),
- doneread(false), donebarrier(false) {}
-
-std::string ceph::io_exerciser::Seq6::get_name() const
-{
+std::string ceph::io_exerciser::Seq6::get_name() const {
return "Permutation of length sequential writes, different alignment";
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next()
-{
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next() {
if (offset >= obj_size) {
if (!doneread) {
if (!donebarrier) {
donebarrier = true;
- return IoOp::generate_barrier();
+ return BarrierOp::generate();
}
doneread = true;
barrier = true;
- return IoOp::generate_read(0, obj_size);
+ return SingleReadOp::generate(0, obj_size);
}
doneread = false;
donebarrier = false;
@@ -374,74 +397,72 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next()
if (io_len == 0) {
io_len = length;
}
- std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len);
+ std::unique_ptr<IoOp> r = SingleWriteOp::generate(offset, io_len);
offset += io_len;
return r;
}
-
-
-ceph::io_exerciser::Seq7::Seq7(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed)
-{
+ceph::io_exerciser::Seq7::Seq7(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed) {
set_min_object_size(2);
offset = obj_size;
}
-std::string ceph::io_exerciser::Seq7::get_name() const
-{
+Sequence ceph::io_exerciser::Seq7::get_id() const {
+ return Sequence::SEQUENCE_SEQ7;
+}
+
+std::string ceph::io_exerciser::Seq7::get_name() const {
return "Permutations of offset 2-region 1-block writes";
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq7::_next()
-{
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq7::_next() {
if (!doneread) {
if (!donebarrier) {
donebarrier = true;
- return IoOp::generate_barrier();
+ return BarrierOp::generate();
}
doneread = true;
barrier = true;
- return IoOp::generate_read(0, obj_size);
+ return SingleReadOp::generate(0, obj_size);
}
if (offset == 0) {
doneread = false;
donebarrier = false;
- offset = obj_size+1;
+ offset = obj_size + 1;
return increment_object_size();
}
offset--;
- if (offset == obj_size/2) {
+ if (offset == obj_size / 2) {
return _next();
}
doneread = false;
donebarrier = false;
- return IoOp::generate_write2(offset, 1, obj_size/2, 1);
+ return DoubleWriteOp::generate(offset, 1, obj_size / 2, 1);
}
-
-
-ceph::io_exerciser::Seq8::Seq8(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed), offset1(0), offset2(1)
-{
+ceph::io_exerciser::Seq8::Seq8(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed), offset1(0), offset2(1) {
set_min_object_size(3);
}
-std::string ceph::io_exerciser::Seq8::get_name() const
-{
+Sequence ceph::io_exerciser::Seq8::get_id() const {
+ return Sequence::SEQUENCE_SEQ8;
+}
+
+std::string ceph::io_exerciser::Seq8::get_name() const {
return "Permutations of offset 3-region 1-block write I/O";
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next()
-{
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next() {
if (!doneread) {
if (!donebarrier) {
donebarrier = true;
- return IoOp::generate_barrier();
+ return BarrierOp::generate();
}
doneread = true;
barrier = true;
- return IoOp::generate_read(0, obj_size);
+ return SingleReadOp::generate(0, obj_size);
}
offset2++;
if (offset2 >= obj_size - offset1) {
@@ -455,34 +476,30 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next()
}
doneread = false;
donebarrier = false;
- return IoOp::generate_write3(offset1, 1,
- offset1 + offset2, 1,
- (offset1 * 2 + offset2)/2, 1);
+ return TripleWriteOp::generate(offset1, 1, offset1 + offset2, 1,
+ (offset1 * 2 + offset2) / 2, 1);
}
+ceph::io_exerciser::Seq9::Seq9(std::pair<int, int> obj_size_range, int seed)
+ : IoSequence(obj_size_range, seed), offset(0), length(0) {}
-
-ceph::io_exerciser::Seq9::Seq9(std::pair<int,int> obj_size_range, int seed) :
- IoSequence(obj_size_range, seed), offset(0), length(0)
-{
-
+Sequence ceph::io_exerciser::Seq9::get_id() const {
+ return Sequence::SEQUENCE_SEQ9;
}
-std::string ceph::io_exerciser::Seq9::get_name() const
-{
+std::string ceph::io_exerciser::Seq9::get_name() const {
return "Permutations of offset and length write I/O";
}
-std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next()
-{
+std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next() {
if (!doneread) {
if (!donebarrier) {
donebarrier = true;
- return IoOp::generate_barrier();
+ return BarrierOp::generate();
}
doneread = true;
barrier = true;
- return IoOp::generate_read(0, obj_size);
+ return SingleReadOp::generate(0, obj_size);
}
length++;
if (length > obj_size - offset) {
@@ -496,5 +513,5 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next()
}
doneread = false;
donebarrier = false;
- return IoOp::generate_write(offset, length);
+ return SingleWriteOp::generate(offset, length);
}
\ No newline at end of file
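
With IoOp::done() removed, a driver now tests getOpType() against OpType::Done instead. A minimal sketch of running one sequence to completion, assuming the interfaces above; the size range and seed are arbitrary.

    #include <iostream>

    #include "common/io_exerciser/IoSequence.h"

    using namespace ceph::io_exerciser;

    int main() {
      // Object sizes of 1-8 blocks, fixed seed so the run is reproducible.
      auto seq = IoSequence::generate_sequence(Sequence::SEQUENCE_SEQ0,
                                               {1, 8}, 42);
      std::cout << seq->get_name_with_seqseed() << '\n';

      // next() interleaves Create/Barrier/Remove bookkeeping ops with the
      // sequence's own I/O; a DoneOp marks the end.
      std::unique_ptr<IoOp> op;
      do {
        op = seq->next();
        std::cout << "step " << seq->get_step() << ": "
                  << op->to_string(4096) << '\n';
      } while (op->getOpType() != OpType::Done);
      return 0;
    }
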
diff --git a/src/common/io_exerciser/IoSequence.h b/src/common/io_exerciser/IoSequence.h
index 114ff76303f..b6c254cf096 100644
--- a/src/common/io_exerciser/IoSequence.h
+++ b/src/common/io_exerciser/IoSequence.h
@@ -3,7 +3,6 @@
#pragma once
#include "IoOp.h"
-
#include "include/random.h"
/* Overview
@@ -29,195 +28,209 @@
*/
namespace ceph {
- namespace io_exerciser {
-
- enum class Sequence {
- SEQUENCE_SEQ0,
- SEQUENCE_SEQ1,
- SEQUENCE_SEQ2,
- SEQUENCE_SEQ3,
- SEQUENCE_SEQ4,
- SEQUENCE_SEQ5,
- SEQUENCE_SEQ6,
- SEQUENCE_SEQ7,
- SEQUENCE_SEQ8,
- SEQUENCE_SEQ9,
- //
- SEQUENCE_END,
- SEQUENCE_BEGIN = SEQUENCE_SEQ0
- };
-
- inline Sequence operator++( Sequence& s )
- {
- return s = (Sequence)(((int)(s) + 1));
- }
-
- std::ostream& operator<<(std::ostream& os, const Sequence& seq);
-
- /* I/O Sequences */
-
- class IoSequence {
- public:
- virtual ~IoSequence() = default;
-
- virtual std::string get_name() const = 0;
- int get_step() const;
- int get_seed() const;
-
- std::unique_ptr<IoOp> next();
-
- static std::unique_ptr<IoSequence>
- generate_sequence(Sequence s, std::pair<int,int> obj_size_range, int seed );
-
- protected:
- uint64_t min_obj_size;
- uint64_t max_obj_size;
- bool create;
- bool barrier;
- bool done;
- bool remove;
- uint64_t obj_size;
- int step;
- int seed;
- ceph::util::random_number_generator<int> rng =
- ceph::util::random_number_generator<int>();
-
- IoSequence(std::pair<int,int> obj_size_range, int seed);
-
- virtual std::unique_ptr<IoOp> _next() = 0;
-
- void set_min_object_size(uint64_t size);
- void set_max_object_size(uint64_t size);
- void select_random_object_size();
- std::unique_ptr<IoOp> increment_object_size();
-
- };
-
- class Seq0: public IoSequence {
- public:
- Seq0(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
- std::unique_ptr<IoOp> _next() override;
-
- private:
- uint64_t offset;
- uint64_t length;
- };
-
- class Seq1: public IoSequence {
- public:
- Seq1(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
- std::unique_ptr<IoOp> _next();
-
- private:
- int count;
- };
-
- class Seq2: public IoSequence {
- public:
- Seq2(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
- std::unique_ptr<IoOp> _next() override;
-
- private:
- uint64_t offset;
- uint64_t length;
- };
-
- class Seq3: public IoSequence {
- public:
- Seq3(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
- std::unique_ptr<IoOp> _next() override;
- private:
- uint64_t offset1;
- uint64_t offset2;
- };
-
- class Seq4: public IoSequence {
- public:
- Seq4(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
- std::unique_ptr<IoOp> _next() override;
-
- private:
- uint64_t offset1;
- uint64_t offset2;
- };
-
- class Seq5: public IoSequence {
- public:
- Seq5(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
- std::unique_ptr<IoOp> _next() override;
-
- private:
- uint64_t offset;
- uint64_t length;
- bool doneread;
- bool donebarrier;
- };
-
- class Seq6: public IoSequence {
- public:
- Seq6(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
- std::unique_ptr<IoOp> _next() override;
-
- private:
- uint64_t offset;
- uint64_t length;
- bool doneread;
- bool donebarrier;
- };
-
- class Seq7: public IoSequence {
- public:
- Seq7(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
- std::unique_ptr<IoOp> _next() override;
-
- private:
- uint64_t offset;
- bool doneread = true;
- bool donebarrier = false;
- };
-
- class Seq8: public IoSequence {
- public:
- Seq8(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
- std::unique_ptr<IoOp> _next() override;
- private:
- uint64_t offset1;
- uint64_t offset2;
- bool doneread = true;
- bool donebarrier = false;
- };
-
- class Seq9: public IoSequence {
- private:
- uint64_t offset;
- uint64_t length;
- bool doneread = true;
- bool donebarrier = false;
-
- public:
- Seq9(std::pair<int,int> obj_size_range, int seed);
-
- std::string get_name() const override;
-
- std::unique_ptr<IoOp> _next() override;
- };
- }
-}
\ No newline at end of file
+namespace io_exerciser {
+
+enum class Sequence {
+ SEQUENCE_SEQ0,
+ SEQUENCE_SEQ1,
+ SEQUENCE_SEQ2,
+ SEQUENCE_SEQ3,
+ SEQUENCE_SEQ4,
+ SEQUENCE_SEQ5,
+ SEQUENCE_SEQ6,
+ SEQUENCE_SEQ7,
+ SEQUENCE_SEQ8,
+ SEQUENCE_SEQ9,
+ SEQUENCE_SEQ10,
+
+ SEQUENCE_END,
+ SEQUENCE_BEGIN = SEQUENCE_SEQ0
+};
+
+inline Sequence operator++(Sequence& s) {
+ return s = static_cast<Sequence>(static_cast<int>(s) + 1);
+}
+
+std::ostream& operator<<(std::ostream& os, const Sequence& seq);
+
+/* I/O Sequences */
+
+class IoSequence {
+ public:
+ virtual ~IoSequence() = default;
+
+ virtual Sequence get_id() const = 0;
+ virtual std::string get_name_with_seqseed() const;
+ virtual std::string get_name() const = 0;
+ int get_step() const;
+ int get_seed() const;
+
+ virtual Sequence getNextSupportedSequenceId() const;
+ virtual std::unique_ptr<IoOp> next();
+
+ virtual bool is_supported(Sequence sequence) const;
+ static std::unique_ptr<IoSequence> generate_sequence(
+ Sequence s, std::pair<int, int> obj_size_range, int seed);
+
+ protected:
+ uint64_t min_obj_size;
+ uint64_t max_obj_size;
+ bool create;
+ bool barrier;
+ bool done;
+ bool remove;
+ uint64_t obj_size;
+ int step;
+ int seed;
+ ceph::util::random_number_generator<int> rng =
+ ceph::util::random_number_generator<int>();
+
+ IoSequence(std::pair<int, int> obj_size_range, int seed);
+
+ virtual std::unique_ptr<IoOp> _next() = 0;
+
+ void set_min_object_size(uint64_t size);
+ void set_max_object_size(uint64_t size);
+ void select_random_object_size();
+ std::unique_ptr<IoOp> increment_object_size();
+};
+
+class Seq0 : public IoSequence {
+ public:
+ Seq0(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ uint64_t length;
+};
+
+class Seq1 : public IoSequence {
+ public:
+ Seq1(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ int count;
+};
+
+class Seq2 : public IoSequence {
+ public:
+ Seq2(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ uint64_t length;
+};
+
+class Seq3 : public IoSequence {
+ public:
+ Seq3(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset1;
+ uint64_t offset2;
+};
+
+class Seq4 : public IoSequence {
+ public:
+ Seq4(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset1;
+ uint64_t offset2;
+};
+
+class Seq5 : public IoSequence {
+ public:
+ Seq5(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ uint64_t length;
+ bool doneread;
+ bool donebarrier;
+};
+
+class Seq6 : public IoSequence {
+ public:
+ Seq6(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ uint64_t length;
+ bool doneread;
+ bool donebarrier;
+};
+
+class Seq7 : public IoSequence {
+ public:
+ Seq7(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset;
+ bool doneread = true;
+ bool donebarrier = false;
+};
+
+class Seq8 : public IoSequence {
+ public:
+ Seq8(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+
+ private:
+ uint64_t offset1;
+ uint64_t offset2;
+ bool doneread = true;
+ bool donebarrier = false;
+};
+
+class Seq9 : public IoSequence {
+ private:
+ uint64_t offset;
+ uint64_t length;
+ bool doneread = true;
+ bool donebarrier = false;
+
+ public:
+ Seq9(std::pair<int, int> obj_size_range, int seed);
+
+ Sequence get_id() const override;
+ std::string get_name() const override;
+ std::unique_ptr<IoOp> _next() override;
+};
+} // namespace io_exerciser
+} // namespace ceph
\ No newline at end of file
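
SEQUENCE_SEQ10 joins the enum but is rejected by the base IoSequence::is_supported(); getNextSupportedSequenceId() is essentially the skip loop below. A sketch under the same build-tree assumptions as the earlier examples:

    #include <iostream>

    #include "common/io_exerciser/IoSequence.h"

    using namespace ceph::io_exerciser;

    int main() {
      // operator++ and operator<< make the enum iterable and printable.
      for (Sequence s = Sequence::SEQUENCE_BEGIN;
           s < Sequence::SEQUENCE_END; ++s) {
        std::cout << s << '\n';  // e.g. "SEQUENCE_SEQ0"
      }

      auto seq = IoSequence::generate_sequence(Sequence::SEQUENCE_SEQ0,
                                               {1, 8}, 0);
      // The base class rejects only SEQ10, which needs the EcIoSequence
      // interface mentioned in the abort message above.
      bool ec_only = !seq->is_supported(Sequence::SEQUENCE_SEQ10);
      std::cout << "SEQ10 needs EC interface: " << ec_only << '\n';
      return 0;
    }
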
diff --git a/src/common/io_exerciser/Model.cc b/src/common/io_exerciser/Model.cc
index 50812ecbb15..6548e1eda7a 100644
--- a/src/common/io_exerciser/Model.cc
+++ b/src/common/io_exerciser/Model.cc
@@ -4,25 +4,11 @@
using Model = ceph::io_exerciser::Model;
-Model::Model(const std::string& oid, uint64_t block_size) :
-num_io(0),
-oid(oid),
-block_size(block_size)
-{
+Model::Model(const std::string& oid, uint64_t block_size)
+ : num_io(0), oid(oid), block_size(block_size) {}
-}
+uint64_t Model::get_block_size() const { return block_size; }
-const uint64_t Model::get_block_size() const
-{
- return block_size;
-}
+const std::string Model::get_oid() const { return oid; }
-const std::string Model::get_oid() const
-{
- return oid;
-}
-
-int Model::get_num_io() const
-{
- return num_io;
-}
\ No newline at end of file
+int Model::get_num_io() const { return num_io; }
\ No newline at end of file
diff --git a/src/common/io_exerciser/Model.h b/src/common/io_exerciser/Model.h
index 58d107409a6..9e421e79a78 100644
--- a/src/common/io_exerciser/Model.h
+++ b/src/common/io_exerciser/Model.h
@@ -1,15 +1,13 @@
#pragma once
-#include "IoOp.h"
-
#include <boost/asio/io_context.hpp>
-#include "librados/librados_asio.h"
-
-#include "include/interval_set.h"
-#include "global/global_init.h"
-#include "global/global_context.h"
+#include "IoOp.h"
#include "common/Thread.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "include/interval_set.h"
+#include "librados/librados_asio.h"
/* Overview
*
@@ -21,29 +19,27 @@
*/
namespace ceph {
- namespace io_exerciser {
-
- class Model
- {
- protected:
- int num_io{0};
- std::string oid;
- uint64_t block_size;
-
- public:
- Model(const std::string& oid, uint64_t block_size);
- virtual ~Model() = default;
-
- virtual bool readyForIoOp(IoOp& op) = 0;
- virtual void applyIoOp(IoOp& op) = 0;
-
- const std::string get_oid() const;
- const uint64_t get_block_size() const;
- int get_num_io() const;
- };
-
- /* Simple RADOS I/O generator */
-
-
- }
-}
\ No newline at end of file
+namespace io_exerciser {
+
+class Model {
+ protected:
+ int num_io{0};
+ std::string oid;
+ uint64_t block_size;
+
+ public:
+ Model(const std::string& oid, uint64_t block_size);
+ virtual ~Model() = default;
+
+ virtual bool readyForIoOp(IoOp& op) = 0;
+ virtual void applyIoOp(IoOp& op) = 0;
+
+ const std::string get_oid() const;
+ uint64_t get_block_size() const;
+ int get_num_io() const;
+};
+
+/* Simple RADOS I/O generator */
+
+} // namespace io_exerciser
+} // namespace ceph
\ No newline at end of file
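
Model only obliges subclasses to implement readyForIoOp() and applyIoOp(). A purely illustrative subclass sketch (CountingModel is hypothetical, not in the tree) showing the minimal contract and the protected num_io counter:

    #include "common/io_exerciser/Model.h"

    namespace {
    // Illustrative subclass (not in the patch): counts every non-barrier op.
    class CountingModel : public ceph::io_exerciser::Model {
     public:
      CountingModel(const std::string& oid, uint64_t block_size)
          : Model(oid, block_size) {}

      bool readyForIoOp(ceph::io_exerciser::IoOp&) override { return true; }

      void applyIoOp(ceph::io_exerciser::IoOp& op) override {
        // num_io is a protected member inherited from Model.
        if (op.getOpType() != ceph::io_exerciser::OpType::Barrier) {
          num_io++;
        }
      }
    };
    }  // namespace
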
diff --git a/src/common/io_exerciser/ObjectModel.cc b/src/common/io_exerciser/ObjectModel.cc
index 589f6434282..454d7254cf2 100644
--- a/src/common/io_exerciser/ObjectModel.cc
+++ b/src/common/io_exerciser/ObjectModel.cc
@@ -6,25 +6,20 @@
using ObjectModel = ceph::io_exerciser::ObjectModel;
-ObjectModel::ObjectModel(const std::string& oid, uint64_t block_size, int seed) :
- Model(oid, block_size), created(false)
-{
+ObjectModel::ObjectModel(const std::string& oid, uint64_t block_size, int seed)
+ : Model(oid, block_size), created(false) {
rng.seed(seed);
}
-int ObjectModel::get_seed(uint64_t offset) const
-{
+int ObjectModel::get_seed(uint64_t offset) const {
ceph_assert(offset < contents.size());
return contents[offset];
}
-std::vector<int> ObjectModel::get_seed_offsets(int seed) const
-{
+std::vector<int> ObjectModel::get_seed_offsets(int seed) const {
std::vector<int> offsets;
- for (size_t i = 0; i < contents.size(); i++)
- {
- if (contents[i] == seed)
- {
+ for (size_t i = 0; i < contents.size(); i++) {
+ if (contents[i] == seed) {
offsets.push_back(i);
}
}
@@ -32,8 +27,7 @@ std::vector<int> ObjectModel::get_seed_offsets(int seed) const
return offsets;
}
-std::string ObjectModel::to_string(int mask) const
-{
+std::string ObjectModel::to_string(int mask) const {
if (!created) {
return "Object does not exist";
}
@@ -48,107 +42,127 @@ std::string ObjectModel::to_string(int mask) const
return result;
}
-bool ObjectModel::readyForIoOp(IoOp& op)
-{
- return true;
-}
-
-void ObjectModel::applyIoOp(IoOp& op)
-{
- auto generate_random = [&rng = rng]() {
- return rng();
- };
-
- switch (op.op) {
- case OpType::BARRIER:
- reads.clear();
- writes.clear();
- break;
-
- case OpType::CREATE:
- ceph_assert(!created);
- ceph_assert(reads.empty());
- ceph_assert(writes.empty());
- created = true;
- contents.resize(op.length1);
- std::generate(std::execution::seq, contents.begin(), contents.end(),
- generate_random);
- break;
-
- case OpType::REMOVE:
- ceph_assert(created);
- ceph_assert(reads.empty());
- ceph_assert(writes.empty());
- created = false;
- contents.resize(0);
- break;
-
- case OpType::READ3:
- ceph_assert(created);
- ceph_assert(op.offset3 + op.length3 <= contents.size());
- // Not allowed: read overlapping with parallel write
- ceph_assert(!writes.intersects(op.offset3, op.length3));
- reads.union_insert(op.offset3, op.length3);
- [[fallthrough]];
-
- case OpType::READ2:
- ceph_assert(created);
- ceph_assert(op.offset2 + op.length2 <= contents.size());
- // Not allowed: read overlapping with parallel write
- ceph_assert(!writes.intersects(op.offset2, op.length2));
- reads.union_insert(op.offset2, op.length2);
- [[fallthrough]];
-
- case OpType::READ:
- ceph_assert(created);
- ceph_assert(op.offset1 + op.length1 <= contents.size());
- // Not allowed: read overlapping with parallel write
- ceph_assert(!writes.intersects(op.offset1, op.length1));
- reads.union_insert(op.offset1, op.length1);
- num_io++;
- break;
-
- case OpType::WRITE3:
- ceph_assert(created);
- // Not allowed: write overlapping with parallel read or write
- ceph_assert(!reads.intersects(op.offset3, op.length3));
- ceph_assert(!writes.intersects(op.offset3, op.length3));
- writes.union_insert(op.offset3, op.length3);
- ceph_assert(op.offset3 + op.length3 <= contents.size());
- std::generate(std::execution::seq,
- std::next(contents.begin(), op.offset3),
- std::next(contents.begin(), op.offset3 + op.length3),
- generate_random);
- [[fallthrough]];
-
- case OpType::WRITE2:
- ceph_assert(created);
- // Not allowed: write overlapping with parallel read or write
- ceph_assert(!reads.intersects(op.offset2, op.length2));
- ceph_assert(!writes.intersects(op.offset2, op.length2));
- writes.union_insert(op.offset2, op.length2);
- ceph_assert(op.offset2 + op.length2 <= contents.size());
- std::generate(std::execution::seq,
- std::next(contents.begin(), op.offset2),
- std::next(contents.begin(), op.offset2 + op.length2),
- generate_random);
- [[fallthrough]];
-
- case OpType::WRITE:
- ceph_assert(created);
- // Not allowed: write overlapping with parallel read or write
- ceph_assert(!reads.intersects(op.offset1, op.length1));
- ceph_assert(!writes.intersects(op.offset1, op.length1));
- writes.union_insert(op.offset1, op.length1);
- ceph_assert(op.offset1 + op.length1 <= contents.size());
- std::generate(std::execution::seq,
- std::next(contents.begin(), op.offset1),
- std::next(contents.begin(), op.offset1 + op.length1),
- generate_random);
- num_io++;
- break;
- default:
- break;
+bool ObjectModel::readyForIoOp(IoOp& op) { return true; }
+
+void ObjectModel::applyIoOp(IoOp& op) {
+ auto generate_random = [&rng = rng]() { return rng(); };
+
+ auto verify_and_record_read_op =
+ [&contents = contents, &created = created, &num_io = num_io,
+ &reads = reads,
+ &writes = writes]<OpType opType, int N>(ReadWriteOp<opType, N>& readOp) {
+ ceph_assert(created);
+ for (int i = 0; i < N; i++) {
+ ceph_assert(readOp.offset[i] + readOp.length[i] <= contents.size());
+ // Not allowed: read overlapping with parallel write
+ ceph_assert(!writes.intersects(readOp.offset[i], readOp.length[i]));
+ reads.union_insert(readOp.offset[i], readOp.length[i]);
+ }
+ num_io++;
+ };
+
+ auto verify_write_and_record_and_generate_seed =
+ [&generate_random, &contents = contents, &created = created,
+ &num_io = num_io, &reads = reads,
+ &writes = writes]<OpType opType, int N>(ReadWriteOp<opType, N>& writeOp) {
+ ceph_assert(created);
+ for (int i = 0; i < N; i++) {
+ // Not allowed: write overlapping with parallel read or write
+ ceph_assert(!reads.intersects(writeOp.offset[i], writeOp.length[i]));
+ ceph_assert(!writes.intersects(writeOp.offset[i], writeOp.length[i]));
+ writes.union_insert(writeOp.offset[i], writeOp.length[i]);
+ ceph_assert(writeOp.offset[i] + writeOp.length[i] <= contents.size());
+ std::generate(std::execution::seq,
+ std::next(contents.begin(), writeOp.offset[i]),
+ std::next(contents.begin(),
+ writeOp.offset[i] + writeOp.length[i]),
+ generate_random);
+ }
+ num_io++;
+ };
+
+ auto verify_failed_write_and_record =
+ [&contents = contents, &created = created, &num_io = num_io,
+ &reads = reads,
+ &writes = writes]<OpType opType, int N>(ReadWriteOp<opType, N>& writeOp) {
+ // Ensure the write would still be valid, even though we expect the OSD
+ // to fail it
+ ceph_assert(created);
+ for (int i = 0; i < N; i++) {
+ // Not allowed: write overlapping with parallel read or write
+ ceph_assert(!reads.intersects(writeOp.offset[i], writeOp.length[i]));
+ ceph_assert(!writes.intersects(writeOp.offset[i], writeOp.length[i]));
+ writes.union_insert(writeOp.offset[i], writeOp.length[i]);
+ ceph_assert(writeOp.offset[i] + writeOp.length[i] <= contents.size());
+ }
+ num_io++;
+ };
+
+ switch (op.getOpType()) {
+ case OpType::Barrier:
+ reads.clear();
+ writes.clear();
+ break;
+
+ case OpType::Create:
+ ceph_assert(!created);
+ ceph_assert(reads.empty());
+ ceph_assert(writes.empty());
+ created = true;
+ contents.resize(static_cast<CreateOp&>(op).size);
+ std::generate(std::execution::seq, contents.begin(), contents.end(),
+ generate_random);
+ break;
+
+ case OpType::Remove:
+ ceph_assert(created);
+ ceph_assert(reads.empty());
+ ceph_assert(writes.empty());
+ created = false;
+ contents.resize(0);
+ break;
+
+ case OpType::Read: {
+ SingleReadOp& readOp = static_cast<SingleReadOp&>(op);
+ verify_and_record_read_op(readOp);
+ } break;
+ case OpType::Read2: {
+ DoubleReadOp& readOp = static_cast<DoubleReadOp&>(op);
+ verify_and_record_read_op(readOp);
+ } break;
+ case OpType::Read3: {
+ TripleReadOp& readOp = static_cast<TripleReadOp&>(op);
+ verify_and_record_read_op(readOp);
+ } break;
+
+ case OpType::Write: {
+ ceph_assert(created);
+ SingleWriteOp& writeOp = static_cast<SingleWriteOp&>(op);
+ verify_write_and_record_and_generate_seed(writeOp);
+ } break;
+ case OpType::Write2: {
+ DoubleWriteOp& writeOp = static_cast<DoubleWriteOp&>(op);
+ verify_write_and_record_and_generate_seed(writeOp);
+ } break;
+ case OpType::Write3: {
+ TripleWriteOp& writeOp = static_cast<TripleWriteOp&>(op);
+ verify_write_and_record_and_generate_seed(writeOp);
+ } break;
+ case OpType::FailedWrite: {
+ ceph_assert(created);
+ SingleFailedWriteOp& writeOp = static_cast<SingleFailedWriteOp&>(op);
+ verify_failed_write_and_record(writeOp);
+ } break;
+ case OpType::FailedWrite2: {
+ DoubleFailedWriteOp& writeOp = static_cast<DoubleFailedWriteOp&>(op);
+ verify_failed_write_and_record(writeOp);
+ } break;
+ case OpType::FailedWrite3: {
+ TripleFailedWriteOp& writeOp = static_cast<TripleFailedWriteOp&>(op);
+ verify_failed_write_and_record(writeOp);
+ } break;
+ default:
+ break;
}
}
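
The reads/writes interval sets are what catch conflicting parallel I/O in applyIoOp above. Here is the same bookkeeping in isolation, a standalone sketch using only the interval_set calls the patch itself relies on (union_insert, intersects, clear, empty):

    #include <cassert>

    #include "include/interval_set.h"

    int main() {
      interval_set<uint64_t> reads, writes;

      reads.union_insert(0, 4);         // in-flight read covering blocks [0,4)
      assert(reads.intersects(2, 4));   // a write over [2,6) would conflict
      assert(!reads.intersects(4, 2));  // [4,6) is free

      // A Barrier op clears both sets, so tracking restarts empty.
      reads.clear();
      writes.clear();
      assert(reads.empty() && writes.empty());
      return 0;
    }
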
diff --git a/src/common/io_exerciser/ObjectModel.h b/src/common/io_exerciser/ObjectModel.h
index 93c70f41429..cad1307b84e 100644
--- a/src/common/io_exerciser/ObjectModel.h
+++ b/src/common/io_exerciser/ObjectModel.h
@@ -14,40 +14,41 @@
*/
namespace ceph {
- namespace io_exerciser {
- /* Model of an object to track its data contents */
-
- class ObjectModel : public Model {
- private:
- bool created;
- std::vector<int> contents;
- ceph::util::random_number_generator<int> rng =
- ceph::util::random_number_generator<int>();
-
- // Track read and write I/Os that can be submitted in
- // parallel to detect violations:
- //
- // * Read may not overlap with a parallel write
- // * Write may not overlap with a parallel read or write
- // * Create / remove may not be in parallel with read or write
- //
- // Fix broken test cases by adding barrier ops to restrict
- // I/O exercisers from issuing conflicting ops in parallel
- interval_set<uint64_t> reads;
- interval_set<uint64_t> writes;
- public:
- ObjectModel(const std::string& oid, uint64_t block_size, int seed);
-
- int get_seed(uint64_t offset) const;
- std::vector<int> get_seed_offsets(int seed) const;
-
- std::string to_string(int mask = -1) const;
-
- bool readyForIoOp(IoOp& op);
- void applyIoOp(IoOp& op);
-
- void encode(ceph::buffer::list& bl) const;
- void decode(ceph::buffer::list::const_iterator& bl);
- };
- }
-}
\ No newline at end of file
+namespace io_exerciser {
+/* Model of an object to track its data contents */
+
+class ObjectModel : public Model {
+ private:
+ bool created;
+ std::vector<int> contents;
+ ceph::util::random_number_generator<int> rng =
+ ceph::util::random_number_generator<int>();
+
+ // Track read and write I/Os that can be submitted in
+ // parallel to detect violations:
+ //
+ // * Read may not overlap with a parallel write
+ // * Write may not overlap with a parallel read or write
+ // * Create / remove may not be in parallel with read or write
+ //
+ // Fix broken test cases by adding barrier ops to restrict
+ // I/O exercisers from issuing conflicting ops in parallel
+ interval_set<uint64_t> reads;
+ interval_set<uint64_t> writes;
+
+ public:
+ ObjectModel(const std::string& oid, uint64_t block_size, int seed);
+
+ int get_seed(uint64_t offset) const;
+ std::vector<int> get_seed_offsets(int seed) const;
+
+ std::string to_string(int mask = -1) const;
+
+ bool readyForIoOp(IoOp& op);
+ void applyIoOp(IoOp& op);
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+};
+} // namespace io_exerciser
+} // namespace ceph \ No newline at end of file
diff --git a/src/common/io_exerciser/OpType.h b/src/common/io_exerciser/OpType.h
new file mode 100644
index 00000000000..7cddb805e45
--- /dev/null
+++ b/src/common/io_exerciser/OpType.h
@@ -0,0 +1,91 @@
+#pragma once
+
+#include <fmt/format.h>
+#include <include/ceph_assert.h>
+
+/* Overview
+ *
+ * enum OpType
+ * Enumeration of different types of I/O operation
+ *
+ */
+
+namespace ceph {
+namespace io_exerciser {
+enum class OpType {
+ Done, // End of I/O sequence
+ Barrier, // Barrier - all prior I/Os must complete
+ Create, // Create object and pattern with data
+ Remove, // Remove object
+ Read, // Read
+ Read2, // Two reads in a single op
+ Read3, // Three reads in a single op
+ Write, // Write
+ Write2, // Two writes in a single op
+ Write3, // Three writes in a single op
+ FailedWrite, // A write which should fail
+ FailedWrite2, // Two writes in one op which should fail
+ FailedWrite3, // Three writes in one op which should fail
+ InjectReadError, // Op to tell OSD to inject read errors
+ InjectWriteError, // Op to tell OSD to inject write errors
+ ClearReadErrorInject, // Op to tell OSD to clear read error injects
+ ClearWriteErrorInject // Op to tell OSD to clear write error injects
+};
+
+enum class InjectOpType {
+ None,
+ ReadEIO,
+ ReadMissingShard,
+ WriteFailAndRollback,
+ WriteOSDAbort
+};
+} // namespace io_exerciser
+} // namespace ceph
+
+template <>
+struct fmt::formatter<ceph::io_exerciser::OpType> {
+ constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
+
+ auto format(ceph::io_exerciser::OpType opType,
+ fmt::format_context& ctx) const -> fmt::format_context::iterator {
+ switch (opType) {
+ case ceph::io_exerciser::OpType::Done:
+ return fmt::format_to(ctx.out(), "Done");
+ case ceph::io_exerciser::OpType::Barrier:
+ return fmt::format_to(ctx.out(), "Barrier");
+ case ceph::io_exerciser::OpType::Create:
+ return fmt::format_to(ctx.out(), "Create");
+ case ceph::io_exerciser::OpType::Remove:
+ return fmt::format_to(ctx.out(), "Remove");
+ case ceph::io_exerciser::OpType::Read:
+ return fmt::format_to(ctx.out(), "Read");
+ case ceph::io_exerciser::OpType::Read2:
+ return fmt::format_to(ctx.out(), "Read2");
+ case ceph::io_exerciser::OpType::Read3:
+ return fmt::format_to(ctx.out(), "Read3");
+ case ceph::io_exerciser::OpType::Write:
+ return fmt::format_to(ctx.out(), "Write");
+ case ceph::io_exerciser::OpType::Write2:
+ return fmt::format_to(ctx.out(), "Write2");
+ case ceph::io_exerciser::OpType::Write3:
+ return fmt::format_to(ctx.out(), "Write3");
+ case ceph::io_exerciser::OpType::FailedWrite:
+ return fmt::format_to(ctx.out(), "FailedWrite");
+ case ceph::io_exerciser::OpType::FailedWrite2:
+ return fmt::format_to(ctx.out(), "FailedWrite2");
+ case ceph::io_exerciser::OpType::FailedWrite3:
+ return fmt::format_to(ctx.out(), "FailedWrite3");
+ case ceph::io_exerciser::OpType::InjectReadError:
+ return fmt::format_to(ctx.out(), "InjectReadError");
+ case ceph::io_exerciser::OpType::InjectWriteError:
+ return fmt::format_to(ctx.out(), "InjectWriteError");
+ case ceph::io_exerciser::OpType::ClearReadErrorInject:
+ return fmt::format_to(ctx.out(), "ClearReadErrorInject");
+ case ceph::io_exerciser::OpType::ClearWriteErrorInject:
+ return fmt::format_to(ctx.out(), "ClearWriteErrorInject");
+ default:
+ ceph_abort_msg("Unknown OpType");
+ return fmt::format_to(ctx.out(), "Unknown OpType");
+ }
+ }
+}; \ No newline at end of file
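
The formatter specialization above is what lets later code pass an OpType straight into fmt::format, including the ceph_abort_msg(fmt::format(...)) calls in RadosIo.cc. A minimal usage sketch of the same pattern on a hypothetical stand-in enum (only the formatter shape mirrors the real one):

    #include <fmt/format.h>

    enum class Color { Red, Green };

    template <>
    struct fmt::formatter<Color> {
      constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
      auto format(Color c, fmt::format_context& ctx) const
          -> fmt::format_context::iterator {
        switch (c) {
          case Color::Red:   return fmt::format_to(ctx.out(), "Red");
          case Color::Green: return fmt::format_to(ctx.out(), "Green");
        }
        return fmt::format_to(ctx.out(), "Unknown");
      }
    };

    int main() {
      fmt::print("{}\n", Color::Green);  // prints "Green"
    }
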
diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc
index 44b82260263..4451900b7bb 100644
--- a/src/common/io_exerciser/RadosIo.cc
+++ b/src/common/io_exerciser/RadosIo.cc
@@ -1,300 +1,429 @@
#include "RadosIo.h"
+#include <fmt/format.h>
+#include <json_spirit/json_spirit.h>
+
+#include <ranges>
+
#include "DataGenerator.h"
+#include "common/ceph_json.h"
+#include "common/json/OSDStructures.h"
using RadosIo = ceph::io_exerciser::RadosIo;
-RadosIo::RadosIo(librados::Rados& rados,
- boost::asio::io_context& asio,
- const std::string& pool,
- const std::string& oid,
- uint64_t block_size,
- int seed,
- int threads,
- ceph::mutex& lock,
- ceph::condition_variable& cond) :
- Model(oid, block_size),
- rados(rados),
- asio(asio),
- om(std::make_unique<ObjectModel>(oid, block_size, seed)),
- db(data_generation::DataGenerator::create_generator(
- data_generation::GenerationType::HeaderedSeededRandom, *om)),
- pool(pool),
- threads(threads),
- lock(lock),
- cond(cond),
- outstanding_io(0)
-{
+RadosIo::RadosIo(librados::Rados& rados, boost::asio::io_context& asio,
+ const std::string& pool, const std::string& oid,
+ const std::optional<std::vector<int>>& cached_shard_order,
+ uint64_t block_size, int seed, int threads, ceph::mutex& lock,
+ ceph::condition_variable& cond)
+ : Model(oid, block_size),
+ rados(rados),
+ asio(asio),
+ om(std::make_unique<ObjectModel>(oid, block_size, seed)),
+ db(data_generation::DataGenerator::create_generator(
+ data_generation::GenerationType::HeaderedSeededRandom, *om)),
+ pool(pool),
+ cached_shard_order(cached_shard_order),
+ threads(threads),
+ lock(lock),
+ cond(cond),
+ outstanding_io(0) {
int rc;
rc = rados.ioctx_create(pool.c_str(), io);
ceph_assert(rc == 0);
allow_ec_overwrites(true);
}
-RadosIo::~RadosIo()
-{
-}
+RadosIo::~RadosIo() {}
-void RadosIo::start_io()
-{
+void RadosIo::start_io() {
std::lock_guard l(lock);
outstanding_io++;
}
-void RadosIo::finish_io()
-{
+void RadosIo::finish_io() {
std::lock_guard l(lock);
ceph_assert(outstanding_io > 0);
outstanding_io--;
cond.notify_all();
}
-void RadosIo::wait_for_io(int count)
-{
+void RadosIo::wait_for_io(int count) {
std::unique_lock l(lock);
while (outstanding_io > count) {
cond.wait(l);
}
}
-void RadosIo::allow_ec_overwrites(bool allow)
-{
+void RadosIo::allow_ec_overwrites(bool allow) {
int rc;
bufferlist inbl, outbl;
- std::string cmdstr =
- "{\"prefix\": \"osd pool set\", \"pool\": \"" + pool + "\", \
+ std::string cmdstr = "{\"prefix\": \"osd pool set\", \"pool\": \"" + pool +
+ "\", \
\"var\": \"allow_ec_overwrites\", \"val\": \"" +
- (allow ? "true" : "false") + "\"}";
+ (allow ? "true" : "false") + "\"}";
rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr);
ceph_assert(rc == 0);
}
-RadosIo::AsyncOpInfo::AsyncOpInfo(uint64_t offset1, uint64_t length1,
- uint64_t offset2, uint64_t length2,
- uint64_t offset3, uint64_t length3 ) :
- offset1(offset1), length1(length1),
- offset2(offset2), length2(length2),
- offset3(offset3), length3(length3)
-{
-
-}
+template <int N>
+RadosIo::AsyncOpInfo<N>::AsyncOpInfo(const std::array<uint64_t, N>& offset,
+ const std::array<uint64_t, N>& length)
+ : offset(offset), length(length) {}
-bool RadosIo::readyForIoOp(IoOp &op)
-{
- ceph_assert(ceph_mutex_is_locked_by_me(lock)); //Must be called with lock held
+bool RadosIo::readyForIoOp(IoOp& op) {
+ ceph_assert(
+ ceph_mutex_is_locked_by_me(lock)); // Must be called with lock held
if (!om->readyForIoOp(op)) {
return false;
}
- switch (op.op) {
- case OpType::Done:
- case OpType::BARRIER:
- return outstanding_io == 0;
- default:
- return outstanding_io < threads;
+
+ switch (op.getOpType()) {
+ case OpType::Done:
+ case OpType::Barrier:
+ return outstanding_io == 0;
+ default:
+ return outstanding_io < threads;
}
}
-void RadosIo::applyIoOp(IoOp &op)
-{
- std::shared_ptr<AsyncOpInfo> op_info;
-
+void RadosIo::applyIoOp(IoOp& op) {
om->applyIoOp(op);
// If there are thread concurrent I/Os in flight then wait for
// at least one I/O to complete
- wait_for_io(threads-1);
-
- switch (op.op) {
- case OpType::Done:
- [[ fallthrough ]];
- case OpType::BARRIER:
- // Wait for all outstanding I/O to complete
- wait_for_io(0);
- break;
-
- case OpType::CREATE:
- {
+ wait_for_io(threads - 1);
+
+ switch (op.getOpType()) {
+ case OpType::Done:
+ [[fallthrough]];
+ case OpType::Barrier:
+ // Wait for all outstanding I/O to complete
+ wait_for_io(0);
+ break;
+
+ case OpType::Create: {
start_io();
- op_info = std::make_shared<AsyncOpInfo>(0, op.length1);
- op_info->bl1 = db->generate_data(0, op.length1);
- op_info->wop.write_full(op_info->bl1);
- auto create_cb = [this] (boost::system::error_code ec,
- version_t ver) {
+ uint64_t opSize = static_cast<CreateOp&>(op).size;
+ std::shared_ptr<AsyncOpInfo<1>> op_info =
+ std::make_shared<AsyncOpInfo<1>>(std::array<uint64_t, 1>{0},
+ std::array<uint64_t, 1>{opSize});
+ op_info->bufferlist[0] = db->generate_data(0, opSize);
+ op_info->wop.write_full(op_info->bufferlist[0]);
+ auto create_cb = [this](boost::system::error_code ec, version_t ver) {
ceph_assert(ec == boost::system::errc::success);
finish_io();
};
- librados::async_operate(asio, io, oid,
- &op_info->wop, 0, nullptr, create_cb);
+ librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr,
+ create_cb);
+ break;
}
- break;
- case OpType::REMOVE:
- {
+ case OpType::Remove: {
start_io();
- op_info = std::make_shared<AsyncOpInfo>();
+ auto op_info = std::make_shared<AsyncOpInfo<0>>();
op_info->wop.remove();
- auto remove_cb = [this] (boost::system::error_code ec,
- version_t ver) {
+ auto remove_cb = [this](boost::system::error_code ec, version_t ver) {
ceph_assert(ec == boost::system::errc::success);
finish_io();
};
- librados::async_operate(asio, io, oid,
- &op_info->wop, 0, nullptr, remove_cb);
+ librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr,
+ remove_cb);
+ break;
}
- break;
+ case OpType::Read:
+ [[fallthrough]];
+ case OpType::Read2:
+ [[fallthrough]];
+ case OpType::Read3:
+ [[fallthrough]];
+ case OpType::Write:
+ [[fallthrough]];
+ case OpType::Write2:
+ [[fallthrough]];
+ case OpType::Write3:
+ [[fallthrough]];
+ case OpType::FailedWrite:
+ [[fallthrough]];
+ case OpType::FailedWrite2:
+ [[fallthrough]];
+ case OpType::FailedWrite3:
+ applyReadWriteOp(op);
+ break;
+ case OpType::InjectReadError:
+ [[fallthrough]];
+ case OpType::InjectWriteError:
+ [[fallthrough]];
+ case OpType::ClearReadErrorInject:
+ [[fallthrough]];
+ case OpType::ClearWriteErrorInject:
+ applyInjectOp(op);
+ break;
+ default:
+ ceph_abort_msg("Unrecognised Op");
+ break;
+ }
+}
- case OpType::READ:
- {
- start_io();
- op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1);
- op_info->rop.read(op.offset1 * block_size,
- op.length1 * block_size,
- &op_info->bl1, nullptr);
- auto read_cb = [this, op_info] (boost::system::error_code ec,
- version_t ver,
- bufferlist bl) {
- ceph_assert(ec == boost::system::errc::success);
- ceph_assert(db->validate(op_info->bl1,
- op_info->offset1,
- op_info->length1));
- finish_io();
- };
- librados::async_operate(asio, io, oid,
- &op_info->rop, 0, nullptr, read_cb);
- num_io++;
+void RadosIo::applyReadWriteOp(IoOp& op) {
+ auto applyReadOp = [this]<OpType opType, int N>(
+ ReadWriteOp<opType, N> readOp) {
+ auto op_info =
+ std::make_shared<AsyncOpInfo<N>>(readOp.offset, readOp.length);
+
+ for (int i = 0; i < N; i++) {
+ op_info->rop.read(readOp.offset[i] * block_size,
+ readOp.length[i] * block_size, &op_info->bufferlist[i],
+ nullptr);
}
- break;
+ auto read_cb = [this, op_info](boost::system::error_code ec, version_t ver,
+ bufferlist bl) {
+ ceph_assert(ec == boost::system::errc::success);
+ for (int i = 0; i < N; i++) {
+ ceph_assert(db->validate(op_info->bufferlist[i], op_info->offset[i],
+ op_info->length[i]));
+ }
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid, &op_info->rop, 0, nullptr, read_cb);
+ num_io++;
+ };
- case OpType::READ2:
- {
- start_io();
- op_info = std::make_shared<AsyncOpInfo>(op.offset1,
- op.length1,
- op.offset2,
- op.length2);
-
- op_info->rop.read(op.offset1 * block_size,
- op.length1 * block_size,
- &op_info->bl1, nullptr);
- op_info->rop.read(op.offset2 * block_size,
- op.length2 * block_size,
- &op_info->bl2, nullptr);
- auto read2_cb = [this, op_info] (boost::system::error_code ec,
- version_t ver,
- bufferlist bl) {
- ceph_assert(ec == boost::system::errc::success);
- ceph_assert(db->validate(op_info->bl1,
- op_info->offset1,
- op_info->length1));
- ceph_assert(db->validate(op_info->bl2,
- op_info->offset2,
- op_info->length2));
- finish_io();
- };
- librados::async_operate(asio, io, oid,
- &op_info->rop, 0, nullptr, read2_cb);
- num_io++;
+ auto applyWriteOp = [this]<OpType opType, int N>(
+ ReadWriteOp<opType, N> writeOp) {
+ auto op_info =
+ std::make_shared<AsyncOpInfo<N>>(writeOp.offset, writeOp.length);
+ for (int i = 0; i < N; i++) {
+ op_info->bufferlist[i] =
+ db->generate_data(writeOp.offset[i], writeOp.length[i]);
+ op_info->wop.write(writeOp.offset[i] * block_size,
+ op_info->bufferlist[i]);
}
- break;
+ auto write_cb = [this](boost::system::error_code ec, version_t ver) {
+ ceph_assert(ec == boost::system::errc::success);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, write_cb);
+ num_io++;
+ };
- case OpType::READ3:
- {
- start_io();
- op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1,
- op.offset2, op.length2,
- op.offset3, op.length3);
- op_info->rop.read(op.offset1 * block_size,
- op.length1 * block_size,
- &op_info->bl1, nullptr);
- op_info->rop.read(op.offset2 * block_size,
- op.length2 * block_size,
- &op_info->bl2, nullptr);
- op_info->rop.read(op.offset3 * block_size,
- op.length3 * block_size,
- &op_info->bl3, nullptr);
- auto read3_cb = [this, op_info] (boost::system::error_code ec,
- version_t ver,
- bufferlist bl) {
- ceph_assert(ec == boost::system::errc::success);
- ceph_assert(db->validate(op_info->bl1,
- op_info->offset1,
- op_info->length1));
- ceph_assert(db->validate(op_info->bl2,
- op_info->offset2,
- op_info->length2));
- ceph_assert(db->validate(op_info->bl3,
- op_info->offset3,
- op_info->length3));
- finish_io();
- };
- librados::async_operate(asio, io, oid,
- &op_info->rop, 0, nullptr, read3_cb);
- num_io++;
+ auto applyFailedWriteOp = [this]<OpType opType, int N>(
+ ReadWriteOp<opType, N> writeOp) {
+ auto op_info =
+ std::make_shared<AsyncOpInfo<N>>(writeOp.offset, writeOp.length);
+ for (int i = 0; i < N; i++) {
+ op_info->bufferlist[i] =
+ db->generate_data(writeOp.offset[i], writeOp.length[i]);
+ op_info->wop.write(writeOp.offset[i] * block_size,
+ op_info->bufferlist[i]);
}
- break;
+ auto write_cb = [this, writeOp](boost::system::error_code ec,
+ version_t ver) {
+ ceph_assert(ec != boost::system::errc::success);
+ finish_io();
+ };
+ librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, write_cb);
+ num_io++;
+ };
- case OpType::WRITE:
- {
+ switch (op.getOpType()) {
+ case OpType::Read: {
start_io();
- op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1);
- op_info->bl1 = db->generate_data(op.offset1, op.length1);
-
- op_info->wop.write(op.offset1 * block_size, op_info->bl1);
- auto write_cb = [this] (boost::system::error_code ec,
- version_t ver) {
- ceph_assert(ec == boost::system::errc::success);
- finish_io();
- };
- librados::async_operate(asio, io, oid,
- &op_info->wop, 0, nullptr, write_cb);
- num_io++;
+ SingleReadOp& readOp = static_cast<SingleReadOp&>(op);
+ applyReadOp(readOp);
+ break;
}
- break;
-
- case OpType::WRITE2:
- {
+ case OpType::Read2: {
start_io();
- op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1,
- op.offset2, op.length2);
- op_info->bl1 = db->generate_data(op.offset1, op.length1);
- op_info->bl2 = db->generate_data(op.offset2, op.length2);
- op_info->wop.write(op.offset1 * block_size, op_info->bl1);
- op_info->wop.write(op.offset2 * block_size, op_info->bl2);
- auto write2_cb = [this] (boost::system::error_code ec,
- version_t ver) {
- ceph_assert(ec == boost::system::errc::success);
- finish_io();
- };
- librados::async_operate(asio, io, oid,
- &op_info->wop, 0, nullptr, write2_cb);
- num_io++;
+ DoubleReadOp& readOp = static_cast<DoubleReadOp&>(op);
+ applyReadOp(readOp);
+ break;
+ }
+ case OpType::Read3: {
+ start_io();
+ TripleReadOp& readOp = static_cast<TripleReadOp&>(op);
+ applyReadOp(readOp);
+ break;
+ }
+ case OpType::Write: {
+ start_io();
+ SingleWriteOp& writeOp = static_cast<SingleWriteOp&>(op);
+ applyWriteOp(writeOp);
+ break;
+ }
+ case OpType::Write2: {
+ start_io();
+ DoubleWriteOp& writeOp = static_cast<DoubleWriteOp&>(op);
+ applyWriteOp(writeOp);
+ break;
+ }
+ case OpType::Write3: {
+ start_io();
+ TripleWriteOp& writeOp = static_cast<TripleWriteOp&>(op);
+ applyWriteOp(writeOp);
+ break;
}
- break;
- case OpType::WRITE3:
- {
+ case OpType::FailedWrite: {
start_io();
- op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1,
- op.offset2, op.length2,
- op.offset3, op.length3);
- op_info->bl1 = db->generate_data(op.offset1, op.length1);
- op_info->bl2 = db->generate_data(op.offset2, op.length2);
- op_info->bl3 = db->generate_data(op.offset3, op.length3);
- op_info->wop.write(op.offset1 * block_size, op_info->bl1);
- op_info->wop.write(op.offset2 * block_size, op_info->bl2);
- op_info->wop.write(op.offset3 * block_size, op_info->bl3);
- auto write3_cb = [this] (boost::system::error_code ec,
- version_t ver) {
- ceph_assert(ec == boost::system::errc::success);
- finish_io();
- };
- librados::async_operate(asio, io, oid,
- &op_info->wop, 0, nullptr, write3_cb);
- num_io++;
+ SingleFailedWriteOp& writeOp = static_cast<SingleFailedWriteOp&>(op);
+ applyFailedWriteOp(writeOp);
+ break;
+ }
+ case OpType::FailedWrite2: {
+ start_io();
+ DoubleFailedWriteOp& writeOp = static_cast<DoubleFailedWriteOp&>(op);
+ applyFailedWriteOp(writeOp);
+ break;
+ }
+ case OpType::FailedWrite3: {
+ start_io();
+ TripleFailedWriteOp& writeOp = static_cast<TripleFailedWriteOp&>(op);
+ applyFailedWriteOp(writeOp);
+ break;
}
- break;
- default:
- break;
+ default:
+ ceph_abort_msg(
+ fmt::format("Unsupported Read/Write operation ({})", op.getOpType()));
+ break;
}
}
+
+void RadosIo::applyInjectOp(IoOp& op) {
+ bufferlist osdmap_inbl, inject_inbl, osdmap_outbl, inject_outbl;
+ auto formatter = std::make_unique<JSONFormatter>(false);
+ std::ostringstream oss;
+
+ int osd = -1;
+ std::vector<int> shard_order;
+
+ ceph::messaging::osd::OSDMapRequest osdMapRequest{pool, get_oid(), ""};
+ encode_json("OSDMapRequest", osdMapRequest, formatter.get());
+ formatter->flush(oss);
+ int rc = rados.mon_command(oss.str(), osdmap_inbl, &osdmap_outbl, nullptr);
+ ceph_assert(rc == 0);
+
+ JSONParser p;
+ bool success = p.parse(osdmap_outbl.c_str(), osdmap_outbl.length());
+ ceph_assert(success);
+
+ ceph::messaging::osd::OSDMapReply reply;
+ reply.decode_json(&p);
+
+ osd = reply.acting_primary;
+ shard_order = reply.acting;
+
+ switch (op.getOpType()) {
+ case OpType::InjectReadError: {
+ InjectReadErrorOp& errorOp = static_cast<InjectReadErrorOp&>(op);
+
+ if (errorOp.type == 0) {
+ ceph::messaging::osd::InjectECErrorRequest<InjectOpType::ReadEIO>
+ injectErrorRequest{pool, oid, errorOp.shard,
+ errorOp.type, errorOp.when, errorOp.duration};
+ encode_json("InjectECErrorRequest", injectErrorRequest,
+ formatter.get());
+ } else if (errorOp.type == 1) {
+ ceph::messaging::osd::InjectECErrorRequest<
+ InjectOpType::ReadMissingShard>
+ injectErrorRequest{pool, oid, errorOp.shard,
+ errorOp.type, errorOp.when, errorOp.duration};
+ encode_json("InjectECErrorRequest", injectErrorRequest,
+ formatter.get());
+ } else {
+ ceph_abort_msg("Unsupported inject type");
+ }
+ formatter->flush(oss);
+ int rc = rados.osd_command(osd, oss.str(), inject_inbl, &inject_outbl,
+ nullptr);
+ ceph_assert(rc == 0);
+ break;
+ }
+ case OpType::InjectWriteError: {
+ InjectWriteErrorOp& errorOp = static_cast<InjectWriteErrorOp&>(op);
+
+ if (errorOp.type == 0) {
+ ceph::messaging::osd::InjectECErrorRequest<
+ InjectOpType::WriteFailAndRollback>
+ injectErrorRequest{pool, oid, errorOp.shard,
+ errorOp.type, errorOp.when, errorOp.duration};
+ encode_json("InjectECErrorRequest", injectErrorRequest,
+ formatter.get());
+ } else if (errorOp.type == 3) {
+ ceph::messaging::osd::InjectECErrorRequest<InjectOpType::WriteOSDAbort>
+ injectErrorRequest{pool, oid, errorOp.shard,
+ errorOp.type, errorOp.when, errorOp.duration};
+ encode_json("InjectECErrorRequest", injectErrorRequest,
+ formatter.get());
+
+      // This inject is sent directly to the shard we want to
+      // inject the error on
+ osd = shard_order[errorOp.shard];
+ } else {
+ ceph_abort("Unsupported inject type");
+ }
+
+ formatter->flush(oss);
+ int rc = rados.osd_command(osd, oss.str(), inject_inbl, &inject_outbl,
+ nullptr);
+ ceph_assert(rc == 0);
+ break;
+ }
+ case OpType::ClearReadErrorInject: {
+ ClearReadErrorInjectOp& errorOp =
+ static_cast<ClearReadErrorInjectOp&>(op);
+
+ if (errorOp.type == 0) {
+ ceph::messaging::osd::InjectECClearErrorRequest<InjectOpType::ReadEIO>
+ clearErrorInject{pool, oid, errorOp.shard, errorOp.type};
+ encode_json("InjectECClearErrorRequest", clearErrorInject,
+ formatter.get());
+ } else if (errorOp.type == 1) {
+ ceph::messaging::osd::InjectECClearErrorRequest<
+ InjectOpType::ReadMissingShard>
+ clearErrorInject{pool, oid, errorOp.shard, errorOp.type};
+ encode_json("InjectECClearErrorRequest", clearErrorInject,
+ formatter.get());
+ } else {
+ ceph_abort("Unsupported inject type");
+ }
+
+ formatter->flush(oss);
+ int rc = rados.osd_command(osd, oss.str(), inject_inbl, &inject_outbl,
+ nullptr);
+ ceph_assert(rc == 0);
+ break;
+ }
+ case OpType::ClearWriteErrorInject: {
+      ClearWriteErrorInjectOp& errorOp =
+          static_cast<ClearWriteErrorInjectOp&>(op);
+
+ if (errorOp.type == 0) {
+ ceph::messaging::osd::InjectECClearErrorRequest<
+ InjectOpType::WriteFailAndRollback>
+ clearErrorInject{pool, oid, errorOp.shard, errorOp.type};
+ encode_json("InjectECClearErrorRequest", clearErrorInject,
+ formatter.get());
+ } else if (errorOp.type == 3) {
+ ceph::messaging::osd::InjectECClearErrorRequest<
+ InjectOpType::WriteOSDAbort>
+ clearErrorInject{pool, oid, errorOp.shard, errorOp.type};
+ encode_json("InjectECClearErrorRequest", clearErrorInject,
+ formatter.get());
+ } else {
+ ceph_abort("Unsupported inject type");
+ }
+
+ formatter->flush(oss);
+ int rc = rados.osd_command(osd, oss.str(), inject_inbl, &inject_outbl,
+ nullptr);
+ ceph_assert(rc == 0);
+ break;
+ }
+ default:
+ ceph_abort_msg(
+ fmt::format("Unsupported inject operation ({})", op.getOpType()));
+ break;
+ }
+} \ No newline at end of file
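
applyInjectOp above is a two-step exchange: an "osd map" mon command resolves the object's acting set, then the inject request is sent to the chosen OSD with osd_command. A condensed sketch of the first step under the same APIs the patch uses (assumes a connected librados::Rados and the types from common/json/OSDStructures.h; error handling reduced to early returns):

    #include <sstream>
    #include <string>

    #include "common/ceph_json.h"
    #include "common/json/OSDStructures.h"
    #include "include/rados/librados.hpp"

    // Ask the monitor which OSD is acting primary for pool/oid.
    int find_acting_primary(librados::Rados& rados,
                            const std::string& pool, const std::string& oid) {
      ceph::messaging::osd::OSDMapRequest req{pool, oid, ""};
      JSONFormatter f(false);
      std::ostringstream oss;
      encode_json("OSDMapRequest", req, &f);
      f.flush(oss);

      ceph::bufferlist inbl, outbl;
      if (rados.mon_command(oss.str(), inbl, &outbl, nullptr) != 0) return -1;

      JSONParser p;
      if (!p.parse(outbl.c_str(), outbl.length())) return -1;
      ceph::messaging::osd::OSDMapReply reply;
      reply.decode_json(&p);
      return reply.acting_primary;  // target for the later osd_command()
    }
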
diff --git a/src/common/io_exerciser/RadosIo.h b/src/common/io_exerciser/RadosIo.h
index 179c5bba3ae..a5c66ad4768 100644
--- a/src/common/io_exerciser/RadosIo.h
+++ b/src/common/io_exerciser/RadosIo.h
@@ -10,71 +10,65 @@
* in the object. Uses DataBuffer to create and validate
 * data buffers. When there are no barrier I/Os this may
* issue multiple async I/Os in parallel.
- *
+ *
*/
namespace ceph {
- namespace io_exerciser {
- namespace data_generation {
- class DataGenerator;
- }
-
- class RadosIo: public Model {
- protected:
- librados::Rados& rados;
- boost::asio::io_context& asio;
- std::unique_ptr<ObjectModel> om;
- std::unique_ptr<ceph::io_exerciser::data_generation::DataGenerator> db;
- std::string pool;
- int threads;
- ceph::mutex& lock;
- ceph::condition_variable& cond;
- librados::IoCtx io;
- int outstanding_io;
+namespace io_exerciser {
+namespace data_generation {
+class DataGenerator;
+}
+
+class RadosIo : public Model {
+ protected:
+ librados::Rados& rados;
+ boost::asio::io_context& asio;
+ std::unique_ptr<ObjectModel> om;
+ std::unique_ptr<ceph::io_exerciser::data_generation::DataGenerator> db;
+ std::string pool;
+ std::optional<std::vector<int>> cached_shard_order;
+ int threads;
+ ceph::mutex& lock;
+ ceph::condition_variable& cond;
+ librados::IoCtx io;
+ int outstanding_io;
+
+ void start_io();
+ void finish_io();
+ void wait_for_io(int count);
+
+ public:
+ RadosIo(librados::Rados& rados, boost::asio::io_context& asio,
+ const std::string& pool, const std::string& oid,
+ const std::optional<std::vector<int>>& cached_shard_order,
+ uint64_t block_size, int seed, int threads, ceph::mutex& lock,
+ ceph::condition_variable& cond);
- void start_io();
- void finish_io();
- void wait_for_io(int count);
-
- public:
- RadosIo(librados::Rados& rados,
- boost::asio::io_context& asio,
- const std::string& pool,
- const std::string& oid,
- uint64_t block_size,
- int seed,
- int threads,
- ceph::mutex& lock,
- ceph::condition_variable& cond);
+ ~RadosIo();
- ~RadosIo();
+ void allow_ec_overwrites(bool allow);
- void allow_ec_overwrites(bool allow);
+ template <int N>
+ class AsyncOpInfo {
+ public:
+ librados::ObjectReadOperation rop;
+ librados::ObjectWriteOperation wop;
+ std::array<ceph::bufferlist, N> bufferlist;
+ std::array<uint64_t, N> offset;
+ std::array<uint64_t, N> length;
- class AsyncOpInfo {
- public:
- librados::ObjectReadOperation rop;
- librados::ObjectWriteOperation wop;
- ceph::buffer::list bl1;
- ceph::buffer::list bl2;
- ceph::buffer::list bl3;
- uint64_t offset1;
- uint64_t length1;
- uint64_t offset2;
- uint64_t length2;
- uint64_t offset3;
- uint64_t length3;
+ AsyncOpInfo(const std::array<uint64_t, N>& offset = {},
+ const std::array<uint64_t, N>& length = {});
+ ~AsyncOpInfo() = default;
+ };
- AsyncOpInfo(uint64_t offset1 = 0, uint64_t length1 = 0,
- uint64_t offset2 = 0, uint64_t length2 = 0,
- uint64_t offset3 = 0, uint64_t length3 = 0 );
- ~AsyncOpInfo() = default;
- };
+ // Must be called with lock held
+ bool readyForIoOp(IoOp& op);
+ void applyIoOp(IoOp& op);
- // Must be called with lock held
- bool readyForIoOp(IoOp& op);
-
- void applyIoOp(IoOp& op);
- };
- }
-} \ No newline at end of file
+ private:
+ void applyReadWriteOp(IoOp& op);
+ void applyInjectOp(IoOp& op);
+};
+} // namespace io_exerciser
+} // namespace ceph \ No newline at end of file
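
Replacing bl1/bl2/bl3 and offset1..3/length1..3 with std::array<..., N> members collapses the three old op shapes into a single AsyncOpInfo<N> template. A reduced sketch of the instantiation (buffers elided; the struct is a stand-in, not the real class):

    #include <array>
    #include <cstdint>
    #include <memory>

    // Reduced stand-in for RadosIo::AsyncOpInfo<N>.
    template <int N>
    struct OpInfo {
      std::array<uint64_t, N> offset;
      std::array<uint64_t, N> length;
    };

    int main() {
      // One read: what used to be offset1/length1.
      auto single = std::make_shared<OpInfo<1>>(OpInfo<1>{{4}, {2}});
      // Two reads: what used to be offset1/2 and length1/2.
      auto dual = std::make_shared<OpInfo<2>>(OpInfo<2>{{0, 8}, {2, 2}});
      (void)single;
      (void)dual;
    }
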
diff --git a/src/common/json/BalancerStructures.cc b/src/common/json/BalancerStructures.cc
new file mode 100644
index 00000000000..48dfb843761
--- /dev/null
+++ b/src/common/json/BalancerStructures.cc
@@ -0,0 +1,38 @@
+#include "BalancerStructures.h"
+
+#include "common/ceph_json.h"
+
+using namespace ceph::messaging::balancer;
+
+void BalancerOffRequest::dump(Formatter* f) const {
+ encode_json("prefix", "balancer off", f);
+}
+
+void BalancerOffRequest::decode_json(JSONObj* obj) {}
+
+void BalancerStatusRequest::dump(Formatter* f) const {
+ encode_json("prefix", "balancer status", f);
+}
+
+void BalancerStatusRequest::decode_json(JSONObj* obj) {}
+
+void BalancerStatusReply::dump(Formatter* f) const {
+ encode_json("active", active, f);
+ encode_json("last_optimization_duration", last_optimization_duration, f);
+ encode_json("last_optimization_started", last_optimization_started, f);
+ encode_json("mode", mode, f);
+ encode_json("no_optimization_needed", no_optimization_needed, f);
+ encode_json("optimize_result", optimize_result, f);
+}
+
+void BalancerStatusReply::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("active", active, obj);
+ JSONDecoder::decode_json("last_optimization_duration",
+ last_optimization_duration, obj);
+ JSONDecoder::decode_json("last_optimization_started",
+ last_optimization_started, obj);
+ JSONDecoder::decode_json("mode", mode, obj);
+ JSONDecoder::decode_json("no_optimization_needed", no_optimization_needed,
+ obj);
+ JSONDecoder::decode_json("optimize_result", optimize_result, obj);
+} \ No newline at end of file
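
Every struct in common/json follows one convention: dump() emits the JSON form of a command (including its "prefix"), and decode_json() repopulates the struct from a JSON reply. A short sketch of the reply side for BalancerStatusReply, assuming the JSONParser usage seen in RadosIo.cc above:

    #include <string>

    #include "common/ceph_json.h"
    #include "common/json/BalancerStructures.h"

    // Parse a "balancer status" reply into the typed struct.
    bool parse_status(const std::string& json,
                      ceph::messaging::balancer::BalancerStatusReply& out) {
      JSONParser p;
      if (!p.parse(json.c_str(), json.size())) return false;
      out.decode_json(&p);
      return true;
    }
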
diff --git a/src/common/json/BalancerStructures.h b/src/common/json/BalancerStructures.h
new file mode 100644
index 00000000000..bbf5c748eb3
--- /dev/null
+++ b/src/common/json/BalancerStructures.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <string>
+
+#include "include/types.h"
+
+class JSONObj;
+
+namespace ceph {
+namespace messaging {
+namespace balancer {
+struct BalancerOffRequest {
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct BalancerStatusRequest {
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct BalancerStatusReply {
+ bool active;
+ std::string last_optimization_duration;
+ std::string last_optimization_started;
+ std::string mode;
+ bool no_optimization_needed;
+ std::string optimize_result;
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+} // namespace balancer
+} // namespace messaging
+} // namespace ceph \ No newline at end of file
diff --git a/src/common/json/CMakeLists.txt b/src/common/json/CMakeLists.txt
new file mode 100644
index 00000000000..1497daf93db
--- /dev/null
+++ b/src/common/json/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_library(json_structures STATIC
+ BalancerStructures.cc ConfigStructures.cc OSDStructures.cc)
+
+target_link_libraries(json_structures global) \ No newline at end of file
diff --git a/src/common/json/ConfigStructures.cc b/src/common/json/ConfigStructures.cc
new file mode 100644
index 00000000000..651278d002a
--- /dev/null
+++ b/src/common/json/ConfigStructures.cc
@@ -0,0 +1,20 @@
+#include "ConfigStructures.h"
+
+#include "common/ceph_json.h"
+
+using namespace ceph::messaging::config;
+
+void ConfigSetRequest::dump(Formatter* f) const {
+ encode_json("prefix", "config set", f);
+ encode_json("who", who, f);
+ encode_json("name", name, f);
+ encode_json("value", value, f);
+ encode_json("force", force, f);
+}
+
+void ConfigSetRequest::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("who", who, obj);
+ JSONDecoder::decode_json("name", name, obj);
+ JSONDecoder::decode_json("value", value, obj);
+ JSONDecoder::decode_json("force", force, obj);
+} \ No newline at end of file
diff --git a/src/common/json/ConfigStructures.h b/src/common/json/ConfigStructures.h
new file mode 100644
index 00000000000..554229d75f4
--- /dev/null
+++ b/src/common/json/ConfigStructures.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <optional>
+#include <string>
+
+#include "include/types.h"
+
+class JSONObj;
+
+namespace ceph {
+namespace messaging {
+namespace config {
+struct ConfigSetRequest {
+ std::string who;
+ std::string name;
+ std::string value;
+ std::optional<bool> force;
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+} // namespace config
+} // namespace messaging
+} // namespace ceph \ No newline at end of file
diff --git a/src/common/json/OSDStructures.cc b/src/common/json/OSDStructures.cc
new file mode 100644
index 00000000000..aaac5f6e169
--- /dev/null
+++ b/src/common/json/OSDStructures.cc
@@ -0,0 +1,150 @@
+#include "OSDStructures.h"
+
+#include "common/ceph_json.h"
+#include "common/io_exerciser/OpType.h"
+
+using namespace ceph::messaging::osd;
+
+void OSDMapRequest::dump(Formatter* f) const {
+ encode_json("prefix", "osd map", f);
+ encode_json("pool", pool, f);
+ encode_json("object", object, f);
+ encode_json("nspace", nspace, f);
+ encode_json("format", format, f);
+}
+
+void OSDMapRequest::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("pool", pool, obj);
+ JSONDecoder::decode_json("object", object, obj);
+ JSONDecoder::decode_json("nspace", nspace, obj);
+ JSONDecoder::decode_json("format", format, obj);
+}
+
+void OSDMapReply::dump(Formatter* f) const {
+ encode_json("epoch", epoch, f);
+ encode_json("pool", pool, f);
+ encode_json("pool_id", pool_id, f);
+ encode_json("objname", objname, f);
+ encode_json("raw_pgid", raw_pgid, f);
+ encode_json("pgid", pgid, f);
+ encode_json("up", up, f);
+ encode_json("up_primary", up_primary, f);
+ encode_json("acting", acting, f);
+ encode_json("acting_primary", acting_primary, f);
+}
+
+void OSDMapReply::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("epoch", epoch, obj);
+ JSONDecoder::decode_json("pool", pool, obj);
+ JSONDecoder::decode_json("pool_id", pool_id, obj);
+ JSONDecoder::decode_json("objname", objname, obj);
+ JSONDecoder::decode_json("raw_pgid", raw_pgid, obj);
+ JSONDecoder::decode_json("pgid", pgid, obj);
+ JSONDecoder::decode_json("up", up, obj);
+ JSONDecoder::decode_json("up_primary", up_primary, obj);
+ JSONDecoder::decode_json("acting", acting, obj);
+ JSONDecoder::decode_json("acting_primary", acting_primary, obj);
+}
+
+void OSDPoolGetRequest::dump(Formatter* f) const {
+ encode_json("prefix", "osd pool get", f);
+ encode_json("pool", pool, f);
+ encode_json("var", var, f);
+ encode_json("format", format, f);
+}
+
+void OSDPoolGetRequest::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("pool", pool, obj);
+ JSONDecoder::decode_json("var", var, obj);
+ JSONDecoder::decode_json("format", format, obj);
+}
+
+void OSDPoolGetReply::dump(Formatter* f) const {
+ encode_json("erasure_code_profile", erasure_code_profile, f);
+}
+
+void OSDPoolGetReply::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("erasure_code_profile", erasure_code_profile, obj);
+}
+
+void OSDECProfileGetRequest::dump(Formatter* f) const {
+ encode_json("prefix", "osd pool get", f);
+ encode_json("name", name, f);
+ encode_json("format", format, f);
+}
+
+void OSDECProfileGetRequest::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("name", name, obj);
+ JSONDecoder::decode_json("format", format, obj);
+}
+
+void OSDECProfileGetReply::dump(Formatter* f) const {
+ encode_json("crush-device-class", crush_device_class, f);
+ encode_json("crush-failure-domain", crush_failure_domain, f);
+ encode_json("crush-num-failure-domains", crush_num_failure_domains, f);
+ encode_json("crush-osds-per-failure-domain", crush_osds_per_failure_domain,
+ f);
+ encode_json("crush-root", crush_root, f);
+ encode_json("jerasure-per-chunk-alignment", jerasure_per_chunk_alignment, f);
+ encode_json("k", k, f);
+ encode_json("m", m, f);
+ encode_json("plugin", plugin, f);
+ encode_json("technique", technique, f);
+ encode_json("w", w, f);
+}
+
+void OSDECProfileGetReply::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("crush-device-class", crush_device_class, obj);
+ JSONDecoder::decode_json("crush-failure-domain", crush_failure_domain, obj);
+ JSONDecoder::decode_json("crush-num-failure-domains",
+ crush_num_failure_domains, obj);
+ JSONDecoder::decode_json("crush-osds-per-failure-domain",
+ crush_osds_per_failure_domain, obj);
+ JSONDecoder::decode_json("crush-root", crush_root, obj);
+ JSONDecoder::decode_json("jerasure-per-chunk-alignment",
+ jerasure_per_chunk_alignment, obj);
+ JSONDecoder::decode_json("k", k, obj);
+ JSONDecoder::decode_json("m", m, obj);
+ JSONDecoder::decode_json("plugin", plugin, obj);
+ JSONDecoder::decode_json("technique", technique, obj);
+ JSONDecoder::decode_json("w", w, obj);
+}
+
+void OSDECProfileSetRequest::dump(Formatter* f) const {
+ encode_json("prefix", "osd erasure-code-profile set", f);
+ encode_json("name", name, f);
+ encode_json("profile", profile, f);
+}
+
+void OSDECProfileSetRequest::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("name", name, obj);
+ JSONDecoder::decode_json("profile", profile, obj);
+}
+
+void OSDECPoolCreateRequest::dump(Formatter* f) const {
+ encode_json("prefix", "osd pool create", f);
+ encode_json("pool", pool, f);
+ encode_json("pool_type", pool_type, f);
+ encode_json("pg_num", pg_num, f);
+ encode_json("pgp_num", pgp_num, f);
+ encode_json("erasure_code_profile", erasure_code_profile, f);
+}
+
+void OSDECPoolCreateRequest::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("pool", pool, obj);
+ JSONDecoder::decode_json("pool_type", pool_type, obj);
+ JSONDecoder::decode_json("pg_num", pg_num, obj);
+ JSONDecoder::decode_json("pgp_num", pgp_num, obj);
+ JSONDecoder::decode_json("erasure_code_profile", erasure_code_profile, obj);
+}
+
+void OSDSetRequest::dump(Formatter* f) const {
+ encode_json("prefix", "osd set", f);
+ encode_json("key", key, f);
+ encode_json("yes_i_really_mean_it", yes_i_really_mean_it, f);
+}
+
+void OSDSetRequest::decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("key", key, obj);
+ JSONDecoder::decode_json("yes_i_really_mean_it", yes_i_really_mean_it, obj);
+} \ No newline at end of file
diff --git a/src/common/json/OSDStructures.h b/src/common/json/OSDStructures.h
new file mode 100644
index 00000000000..3e4528a099f
--- /dev/null
+++ b/src/common/json/OSDStructures.h
@@ -0,0 +1,189 @@
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "common/ceph_json.h"
+#include "common/io_exerciser/OpType.h"
+#include "include/types.h"
+
+class JSONObj;
+
+namespace ceph {
+namespace messaging {
+namespace osd {
+struct OSDMapRequest {
+ std::string pool;
+ std::string object;
+ std::string nspace;
+ std::string format = "json";
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct OSDMapReply {
+ epoch_t epoch;
+ std::string pool;
+ uint64_t pool_id;
+ std::string objname;
+ std::string raw_pgid;
+ std::string pgid;
+ std::vector<int> up;
+ int up_primary;
+ std::vector<int> acting;
+ int acting_primary;
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct OSDPoolGetRequest {
+ std::string pool;
+ std::string var = "erasure_code_profile";
+ std::string format = "json";
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct OSDPoolGetReply {
+ std::string erasure_code_profile;
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct OSDECProfileGetRequest {
+ std::string name;
+ std::string format = "json";
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct OSDECProfileGetReply {
+ std::string crush_device_class;
+ std::string crush_failure_domain;
+ int crush_num_failure_domains;
+ int crush_osds_per_failure_domain;
+ std::string crush_root;
+ bool jerasure_per_chunk_alignment;
+ int k;
+ int m;
+ std::string plugin;
+ std::string technique;
+ std::string w;
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct OSDECProfileSetRequest {
+ std::string name;
+ std::vector<std::string> profile;
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct OSDECPoolCreateRequest {
+ std::string pool;
+ std::string pool_type;
+ int pg_num;
+ int pgp_num;
+ std::string erasure_code_profile;
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+struct OSDSetRequest {
+ std::string key;
+ std::optional<bool> yes_i_really_mean_it = std::nullopt;
+
+ void dump(Formatter* f) const;
+ void decode_json(JSONObj* obj);
+};
+
+// These structures are sent directly to the relevant OSD
+// rather than the monitor
+template <io_exerciser::InjectOpType op_type>
+struct InjectECErrorRequest {
+ std::string pool;
+ std::string objname;
+ int shardid;
+ std::optional<uint64_t> type;
+ std::optional<uint64_t> when;
+ std::optional<uint64_t> duration;
+
+ void dump(Formatter* f) const {
+ switch (op_type) {
+ case io_exerciser::InjectOpType::ReadEIO:
+ [[fallthrough]];
+ case io_exerciser::InjectOpType::ReadMissingShard:
+ ::encode_json("prefix", "injectecreaderr", f);
+ break;
+ case io_exerciser::InjectOpType::WriteFailAndRollback:
+ [[fallthrough]];
+ case io_exerciser::InjectOpType::WriteOSDAbort:
+ ::encode_json("prefix", "injectecwriteerr", f);
+ break;
+ default:
+ ceph_abort_msg("Unsupported Inject Type");
+ }
+ ::encode_json("pool", pool, f);
+ ::encode_json("objname", objname, f);
+ ::encode_json("shardid", shardid, f);
+ ::encode_json("type", type, f);
+ ::encode_json("when", when, f);
+ ::encode_json("duration", duration, f);
+ }
+ void decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("pool", pool, obj);
+ JSONDecoder::decode_json("objname", objname, obj);
+ JSONDecoder::decode_json("shardid", shardid, obj);
+ JSONDecoder::decode_json("type", type, obj);
+ JSONDecoder::decode_json("when", when, obj);
+ JSONDecoder::decode_json("duration", duration, obj);
+ }
+};
+
+template <io_exerciser::InjectOpType op_type>
+struct InjectECClearErrorRequest {
+ std::string pool;
+ std::string objname;
+ int shardid;
+ std::optional<uint64_t> type;
+
+ void dump(Formatter* f) const {
+ switch (op_type) {
+ case io_exerciser::InjectOpType::ReadEIO:
+ [[fallthrough]];
+ case io_exerciser::InjectOpType::ReadMissingShard:
+ ::encode_json("prefix", "injectecclearreaderr", f);
+ break;
+ case io_exerciser::InjectOpType::WriteFailAndRollback:
+ [[fallthrough]];
+ case io_exerciser::InjectOpType::WriteOSDAbort:
+ ::encode_json("prefix", "injectecclearwriteerr", f);
+ break;
+ default:
+ ceph_abort_msg("Unsupported Inject Type");
+ }
+ ::encode_json("pool", pool, f);
+ ::encode_json("objname", objname, f);
+ ::encode_json("shardid", shardid, f);
+ ::encode_json("type", type, f);
+ }
+ void decode_json(JSONObj* obj) {
+ JSONDecoder::decode_json("pool", pool, obj);
+ JSONDecoder::decode_json("objname", objname, obj);
+ JSONDecoder::decode_json("shardid", shardid, obj);
+ JSONDecoder::decode_json("type", type, obj);
+ }
+};
+} // namespace osd
+} // namespace messaging
+} // namespace ceph \ No newline at end of file
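
InjectECErrorRequest picks its command prefix ("injectecreaderr" vs "injectecwriteerr") at compile time from the InjectOpType parameter, so the read and write variants share one struct body. A sketch of the caller side, mirroring how RadosIo.cc builds the command string (the shard/type/when/duration values are placeholders):

    #include <sstream>
    #include <string>

    #include "common/ceph_json.h"
    #include "common/json/OSDStructures.h"

    std::string build_read_eio_inject(const std::string& pool,
                                      const std::string& oid) {
      using namespace ceph::messaging::osd;
      using ceph::io_exerciser::InjectOpType;
      // shard 0, type 0 (EIO), fire immediately, no expiry -- placeholders
      InjectECErrorRequest<InjectOpType::ReadEIO> req{pool, oid, 0, 0, 0, 0};
      JSONFormatter f(false);
      std::ostringstream oss;
      encode_json("InjectECErrorRequest", req, &f);
      f.flush(oss);
      return oss.str();  // hand to rados.osd_command(osd, ...)
    }
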
diff --git a/src/common/options/crimson.yaml.in b/src/common/options/crimson.yaml.in
index abef7483bcf..132a4a09e89 100644
--- a/src/common/options/crimson.yaml.in
+++ b/src/common/options/crimson.yaml.in
@@ -6,7 +6,7 @@ options:
type: str
level: advanced
desc: backend type for a Crimson OSD (e.g seastore or bluestore)
- default: seastore
+ default: bluestore
enum_values:
- bluestore
- seastore
diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in
index 4cbc079a215..03a53cd7cea 100644
--- a/src/common/options/mds.yaml.in
+++ b/src/common/options/mds.yaml.in
@@ -1745,4 +1745,4 @@ options:
default: 16
services:
- mds
- min: 8
+ min: 4
diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in
index ab1634bc154..1307030e3fb 100644
--- a/src/common/options/mon.yaml.in
+++ b/src/common/options/mon.yaml.in
@@ -91,6 +91,13 @@ options:
default: 1000
services:
- mon
+- name: mon_nvmeofgw_delete_grace
+ type: secs
+ level: advanced
+ desc: Issue NVMEOF_GATEWAY_DELETING health warning after this amount of time has elapsed
+ default: 15_min
+ services:
+ - mon
- name: mon_mgr_inactive_grace
type: int
level: advanced
diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc
index d125d7171e0..a0629a15686 100644
--- a/src/common/pick_address.cc
+++ b/src/common/pick_address.cc
@@ -642,17 +642,24 @@ int get_iface_numa_node(
bool is_addr_in_subnet(
CephContext *cct,
const std::string &networks,
- const std::string &addr)
+ const entity_addr_t &addr)
{
const auto nets = get_str_list(networks);
ceph_assert(!nets.empty());
-
unsigned ipv = CEPH_PICK_ADDRESS_IPV4;
- struct sockaddr_in public_addr;
- public_addr.sin_family = AF_INET;
-
- if(inet_pton(AF_INET, addr.c_str(), &public_addr.sin_addr) != 1) {
- lderr(cct) << "unable to convert chosen address to string: " << addr << dendl;
+ struct sockaddr_in6 public_addr6;
+ struct sockaddr_in public_addr4;
+
+ if (addr.is_ipv4() &&
+ inet_pton(AF_INET, addr.ip_only_to_str().c_str(), &public_addr4.sin_addr) == 1) {
+ public_addr4.sin_family = AF_INET;
+ } else if (addr.is_ipv6() &&
+ inet_pton(AF_INET6, addr.ip_only_to_str().c_str(), &public_addr6.sin6_addr) == 1) {
+ public_addr6.sin6_family = AF_INET6;
+ ipv = CEPH_PICK_ADDRESS_IPV6;
+ } else {
+ std::string_view addr_type = addr.is_ipv4() ? "IPv4" : "IPv6";
+ lderr(cct) << "IP address " << addr << " is not parseable as " << addr_type << dendl;
return false;
}
@@ -660,10 +667,16 @@ bool is_addr_in_subnet(
struct ifaddrs ifa;
memset(&ifa, 0, sizeof(ifa));
ifa.ifa_next = nullptr;
- ifa.ifa_addr = (struct sockaddr*)&public_addr;
+ if (addr.is_ipv4()) {
+ ifa.ifa_addr = (struct sockaddr*)&public_addr4;
+ } else if (addr.is_ipv6()) {
+ ifa.ifa_addr = (struct sockaddr*)&public_addr6;
+ }
+
if(matches_with_net(cct, ifa, net, ipv)) {
return true;
}
}
+ lderr(cct) << "address " << addr << " is not in networks '" << networks << "'" << dendl;
return false;
}
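
The reworked is_addr_in_subnet takes an entity_addr_t and parses it with the family-appropriate inet_pton before matching it against the configured networks, instead of assuming IPv4. A standalone sketch of the dual-family parse using only POSIX APIs:

    #include <arpa/inet.h>
    #include <netinet/in.h>

    #include <cstdio>

    // Returns 4, 6, or -1 depending on which family parses the string.
    int parse_ip(const char* s) {
      sockaddr_in v4{};
      sockaddr_in6 v6{};
      if (inet_pton(AF_INET, s, &v4.sin_addr) == 1) return 4;
      if (inet_pton(AF_INET6, s, &v6.sin6_addr) == 1) return 6;
      return -1;
    }

    int main() {
      std::printf("%d\n", parse_ip("10.1.2.3"));   // 4
      std::printf("%d\n", parse_ip("fe80::1"));    // 6
      std::printf("%d\n", parse_ip("not-an-ip"));  // -1
    }
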
diff --git a/src/common/pick_address.h b/src/common/pick_address.h
index 40575d7d155..c28a6037ded 100644
--- a/src/common/pick_address.h
+++ b/src/common/pick_address.h
@@ -98,6 +98,6 @@ int get_iface_numa_node(
bool is_addr_in_subnet(
CephContext *cct,
const std::string &networks,
- const std::string &addr);
+ const entity_addr_t &addr);
#endif
diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h
index 92d99d332c4..0d73658e709 100644
--- a/src/crimson/common/shared_lru.h
+++ b/src/crimson/common/shared_lru.h
@@ -25,12 +25,17 @@ class SharedLRU {
SimpleLRU<K, shared_ptr_t, false> cache;
std::map<K, std::pair<weak_ptr_t, V*>> weak_refs;
+  // Once all of the shared pointers are destroyed,
+  // erase the tracked object from the weak_ref map
+  // before actually destroying it
struct Deleter {
- SharedLRU<K,V>* cache;
+ SharedLRU<K,V>* shared_lru_ptr;
const K key;
- void operator()(V* ptr) {
- cache->_erase_weak(key);
- delete ptr;
+ void operator()(V* value_ptr) {
+ if (shared_lru_ptr) {
+ shared_lru_ptr->_erase_weak(key);
+ }
+ delete value_ptr;
}
};
void _erase_weak(const K& key) {
@@ -42,9 +47,19 @@ public:
{}
~SharedLRU() {
cache.clear();
+
// initially, we were assuming that no pointer obtained from SharedLRU
// can outlive the lru itself. However, since going with the interruption
// concept for handling shutdowns, this is no longer valid.
+    // Moreover, before clearing weak_refs, invalidate each deleter's
+    // cache pointer as this SharedLRU is being destroyed.
+ for (const auto& [key, value] : weak_refs) {
+ shared_ptr_t val;
+ val = value.first.lock();
+ auto this_deleter = get_deleter<Deleter>(val);
+ this_deleter->shared_lru_ptr = nullptr;
+ }
+
weak_refs.clear();
}
/**
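
The destructor fix works because std::get_deleter returns a pointer into the shared_ptr's control block, so the stored deleter can be mutated in place; nulling its back-pointer keeps values that outlive the LRU from calling into freed memory. A self-contained sketch of the pattern:

    #include <iostream>
    #include <memory>

    struct Cache;

    struct Deleter {
      Cache* cache;  // back-pointer, nulled when the cache dies first
      void operator()(int* p);
    };

    struct Cache {
      std::weak_ptr<int> ref;
      void erase() { std::cout << "unregistered from cache\n"; }
      ~Cache() {
        if (auto sp = ref.lock()) {
          // Detach any surviving value so its deleter won't call back.
          if (auto* d = std::get_deleter<Deleter>(sp)) d->cache = nullptr;
        }
      }
    };

    void Deleter::operator()(int* p) {
      if (cache) cache->erase();  // only if the cache still exists
      delete p;
    }

    int main() {
      std::shared_ptr<int> survivor;
      {
        Cache c;
        survivor = std::shared_ptr<int>(new int(42), Deleter{&c});
        c.ref = survivor;
      }                  // cache destroyed first; back-pointer nulled
      survivor.reset();  // safe: no call into the destroyed cache
    }
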
diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc
index f390823a8a0..db6decd84f9 100644
--- a/src/crimson/os/alienstore/alien_store.cc
+++ b/src/crimson/os/alienstore/alien_store.cc
@@ -141,7 +141,8 @@ seastar::future<> AlienStore::stop()
AlienStore::base_errorator::future<bool>
AlienStore::exists(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
return op_gates.simple_dispatch("exists", [=, this] {
return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] {
@@ -212,7 +213,8 @@ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
AlienStore::list_objects(CollectionRef ch,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const
+ uint64_t limit,
+ uint32_t op_flags) const
{
logger().debug("{}", __func__);
assert(tp);
@@ -348,7 +350,8 @@ AlienStore::readv(CollectionRef ch,
AlienStore::get_attr_errorator::future<ceph::bufferlist>
AlienStore::get_attr(CollectionRef ch,
const ghobject_t& oid,
- std::string_view name) const
+ std::string_view name,
+ uint32_t op_flags) const
{
logger().debug("{}", __func__);
assert(tp);
@@ -376,7 +379,8 @@ AlienStore::get_attr(CollectionRef ch,
AlienStore::get_attrs_ertr::future<AlienStore::attrs_t>
AlienStore::get_attrs(CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
logger().debug("{}", __func__);
assert(tp);
@@ -397,7 +401,8 @@ AlienStore::get_attrs(CollectionRef ch,
auto AlienStore::omap_get_values(CollectionRef ch,
const ghobject_t& oid,
- const set<string>& keys)
+ const set<string>& keys,
+ uint32_t op_flags)
-> read_errorator::future<omap_values_t>
{
logger().debug("{}", __func__);
@@ -421,7 +426,8 @@ auto AlienStore::omap_get_values(CollectionRef ch,
auto AlienStore::omap_get_values(CollectionRef ch,
const ghobject_t &oid,
- const std::optional<string> &start)
+ const std::optional<string> &start,
+ uint32_t op_flags)
-> read_errorator::future<std::tuple<bool, omap_values_t>>
{
logger().debug("{} with_start", __func__);
@@ -429,8 +435,21 @@ auto AlienStore::omap_get_values(CollectionRef ch,
return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) {
return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] {
auto c = static_cast<AlienCollection*>(ch.get());
- return store->omap_get_values(c->collection, oid, start,
- reinterpret_cast<map<string, bufferlist>*>(&values));
+ return store->omap_iterate(
+ c->collection, oid,
+ ObjectStore::omap_iter_seek_t{
+ .seek_position = start.value_or(std::string{}),
+          // FIXME: classical OSDs begin iteration from LOWER_BOUND
+ // (or UPPER_BOUND if filter_prefix > start). However, these
+ // bits are not implemented yet
+ .seek_type = ObjectStore::omap_iter_seek_t::UPPER_BOUND
+ },
+ [&values]
+ (std::string_view key, std::string_view value) mutable {
+ values[std::string{key}].append(value);
+            // FIXME: the per-call limit on the number of entries is not implemented yet
+ return ObjectStore::omap_iter_ret_t::NEXT;
+ });
}).then([&values] (int r)
-> read_errorator::future<std::tuple<bool, omap_values_t>> {
if (r == -ENOENT) {
@@ -578,7 +597,8 @@ unsigned AlienStore::get_max_attr_name_length() const
seastar::future<struct stat> AlienStore::stat(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
assert(tp);
return do_with_op_gate((struct stat){}, [this, ch, oid](auto& st) {
@@ -604,7 +624,8 @@ seastar::future<std::string> AlienStore::get_default_device_class()
}
auto AlienStore::omap_get_header(CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
-> get_attr_errorator::future<ceph::bufferlist>
{
assert(tp);
@@ -630,7 +651,8 @@ AlienStore::read_errorator::future<std::map<uint64_t, uint64_t>> AlienStore::fie
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
assert(tp);
return do_with_op_gate(std::map<uint64_t, uint64_t>(), [=, this](auto& destmap) {
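
The paged omap_get_values overload is now built on ObjectStore::omap_iterate: seek past the start key, then pull key/value pairs through a callback until it asks to stop. A reduced sketch of that callback shape over a std::map stand-in (the enum and seek behaviour mirror the usage above; the store itself is mocked):

    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>
    #include <string_view>

    enum class iter_ret { STOP, NEXT };

    // Mock of callback-style omap iteration: visit entries with key > start.
    void omap_iterate(const std::map<std::string, std::string>& omap,
                      const std::string& start,
                      std::function<iter_ret(std::string_view,
                                             std::string_view)> cb) {
      for (auto it = omap.upper_bound(start); it != omap.end(); ++it)
        if (cb(it->first, it->second) == iter_ret::STOP) break;
    }

    int main() {
      std::map<std::string, std::string> omap{
          {"a", "1"}, {"b", "2"}, {"c", "3"}};
      std::map<std::string, std::string> values;
      omap_iterate(omap, "a", [&](std::string_view k, std::string_view v) {
        values[std::string{k}] = std::string{v};
        return iter_ret::NEXT;  // no paging limit, as the FIXME notes
      });
      std::cout << values.size() << '\n';  // 2 ("b" and "c")
    }
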
diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h
index 853585dac9c..1d39411450e 100644
--- a/src/crimson/os/alienstore/alien_store.h
+++ b/src/crimson/os/alienstore/alien_store.h
@@ -36,7 +36,8 @@ public:
base_errorator::future<bool> exists(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final;
read_errorator::future<ceph::bufferlist> read(CollectionRef c,
const ghobject_t& oid,
@@ -49,29 +50,36 @@ public:
uint32_t op_flags = 0) final;
- get_attr_errorator::future<ceph::bufferlist> get_attr(CollectionRef c,
- const ghobject_t& oid,
- std::string_view name) const final;
- get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c,
- const ghobject_t& oid) final;
+ get_attr_errorator::future<ceph::bufferlist> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name,
+ uint32_t op_flags = 0) const final;
+ get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) final;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) final;
/// Retrieves paged set of values > start (if present)
read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) final; ///< @return <done, values> values.empty() iff done
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const final;
+ uint64_t limit,
+ uint32_t op_flags = 0) const final;
seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
@@ -97,16 +105,19 @@ public:
unsigned get_max_attr_name_length() const final;
seastar::future<struct stat> stat(
CollectionRef,
- const ghobject_t&) final;
+ const ghobject_t&,
+ uint32_t op_flags = 0) final;
seastar::future<std::string> get_default_device_class() final;
get_attr_errorator::future<ceph::bufferlist> omap_get_header(
CollectionRef,
- const ghobject_t&) final;
+ const ghobject_t&,
+ uint32_t) final;
read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
CollectionRef,
const ghobject_t&,
uint64_t off,
- uint64_t len) final;
+ uint64_t len,
+ uint32_t op_flags) final;
FuturizedStore::Shard& get_sharded_store() final {
return *this;
diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc
index a8bf514de15..41819fb5eb6 100644
--- a/src/crimson/os/cyanstore/cyan_store.cc
+++ b/src/crimson/os/cyanstore/cyan_store.cc
@@ -208,7 +208,8 @@ CyanStore::Shard::list_objects(
CollectionRef ch,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const
+ uint64_t limit,
+ uint32_t op_flags) const
{
auto c = static_cast<Collection*>(ch.get());
logger().debug("{} {} {} {} {}",
@@ -257,7 +258,8 @@ CyanStore::Shard::list_collections()
CyanStore::Shard::base_errorator::future<bool>
CyanStore::Shard::exists(
CollectionRef ch,
- const ghobject_t &oid)
+ const ghobject_t &oid,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
if (!c->exists) {
@@ -333,7 +335,8 @@ CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist>
CyanStore::Shard::get_attr(
CollectionRef ch,
const ghobject_t& oid,
- std::string_view name) const
+ std::string_view name,
+ uint32_t op_flags) const
{
auto c = static_cast<Collection*>(ch.get());
logger().debug("{} {} {}",
@@ -352,7 +355,8 @@ CyanStore::Shard::get_attr(
CyanStore::Shard::get_attrs_ertr::future<CyanStore::Shard::attrs_t>
CyanStore::Shard::get_attrs(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
logger().debug("{} {} {}",
@@ -367,7 +371,8 @@ CyanStore::Shard::get_attrs(
auto CyanStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t& oid,
- const omap_keys_t& keys)
+ const omap_keys_t& keys,
+ uint32_t op_flags)
-> read_errorator::future<omap_values_t>
{
auto c = static_cast<Collection*>(ch.get());
@@ -388,7 +393,8 @@ auto CyanStore::Shard::omap_get_values(
auto CyanStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const std::optional<string> &start)
+ const std::optional<string> &start,
+ uint32_t op_flags)
-> CyanStore::Shard::read_errorator::future<std::tuple<bool, omap_values_t>>
{
auto c = static_cast<Collection*>(ch.get());
@@ -409,7 +415,8 @@ auto CyanStore::Shard::omap_get_values(
auto CyanStore::Shard::omap_get_header(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
-> CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist>
{
auto c = static_cast<Collection*>(ch.get());
@@ -977,7 +984,8 @@ CyanStore::Shard::fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
@@ -992,7 +1000,8 @@ CyanStore::Shard::fiemap(
seastar::future<struct stat>
CyanStore::Shard::stat(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
auto o = c->get_object(oid);
diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h
index e9394991bc2..1d481ef5829 100644
--- a/src/crimson/os/cyanstore/cyan_store.h
+++ b/src/crimson/os/cyanstore/cyan_store.h
@@ -34,11 +34,13 @@ public:
seastar::future<struct stat> stat(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
base_errorator::future<bool> exists(
CollectionRef ch,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<ceph::bufferlist> read(
CollectionRef c,
@@ -56,33 +58,39 @@ public:
get_attr_errorator::future<ceph::bufferlist> get_attr(
CollectionRef c,
const ghobject_t& oid,
- std::string_view name) const final;
+ std::string_view name,
+ uint32_t op_flags = 0) const final;
get_attrs_ertr::future<attrs_t> get_attrs(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) final;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) final;
read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) final;
get_attr_errorator::future<ceph::bufferlist> omap_get_header(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
list_objects(
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const final;
+ uint64_t limit,
+ uint32_t op_flags = 0) const final;
seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
@@ -101,7 +109,8 @@ public:
CollectionRef c,
const ghobject_t& oid,
uint64_t off,
- uint64_t len) final;
+ uint64_t len,
+ uint32_t op_flags) final;
unsigned get_max_attr_name_length() const final;
diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h
index 51ef2331014..e7d4c8546de 100644
--- a/src/crimson/os/futurized_store.h
+++ b/src/crimson/os/futurized_store.h
@@ -54,7 +54,8 @@ public:
virtual base_errorator::future<bool> exists(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
using get_attr_errorator = crimson::errorator<
crimson::ct_error::enoent,
@@ -62,42 +63,49 @@ public:
virtual get_attr_errorator::future<ceph::bufferlist> get_attr(
CollectionRef c,
const ghobject_t& oid,
- std::string_view name) const = 0;
+ std::string_view name,
+ uint32_t op_flags = 0) const = 0;
using get_attrs_ertr = crimson::errorator<
crimson::ct_error::enoent>;
using attrs_t = std::map<std::string, ceph::bufferlist, std::less<>>;
virtual get_attrs_ertr::future<attrs_t> get_attrs(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
virtual seastar::future<struct stat> stat(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
using omap_values_t = attrs_t;
using omap_keys_t = std::set<std::string>;
virtual read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) = 0;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) = 0;
using omap_values_paged_t = std::tuple<bool, omap_values_t>;
virtual read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) = 0; ///< @return <done, values> values.empty() only if done
virtual get_attr_errorator::future<bufferlist> omap_get_header(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const = 0;
+ uint64_t limit,
+ uint32_t op_flags = 0) const = 0;
virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0;
@@ -153,7 +161,8 @@ public:
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len) = 0;
+ uint64_t len,
+ uint32_t op_flags = 0) = 0;
virtual unsigned get_max_attr_name_length() const = 0;
};
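
The FuturizedStore read-side methods above all gain a trailing op_flags parameter defaulted to 0, so existing call sites compile unchanged while new callers can thread fadvise-style flags through. A minimal sketch of the pattern, using hypothetical Store/MemStore types rather than the real Ceph classes:

    #include <cstdint>
    #include <iostream>

    struct Store {
      virtual ~Store() = default;
      // New trailing parameter, defaulted to 0: legacy callers are untouched,
      // new callers can propagate per-op hints.
      virtual bool exists(int oid, uint32_t op_flags = 0) = 0;
    };

    struct MemStore final : Store {
      bool exists(int oid, uint32_t op_flags = 0) override {
        std::cout << "exists(oid=" << oid << ", flags=0x"
                  << std::hex << op_flags << ")\n";
        return true;
      }
    };

    int main() {
      MemStore s;
      s.exists(42);        // legacy call site, default flags
      s.exists(42, 0x20);  // new call site propagating a hint
    }

Because default arguments on virtual functions bind statically at the caller, the diff repeats the "= 0" default on both the interface and each implementation so the two can never disagree.
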
diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc
index 341c5c5524a..64e6749562e 100644
--- a/src/crimson/os/seastore/async_cleaner.cc
+++ b/src/crimson/os/seastore/async_cleaner.cc
@@ -609,6 +609,7 @@ JournalTrimmerImpl::trim_alloc()
return extent_callback->with_transaction_intr(
Transaction::src_t::TRIM_ALLOC,
"trim_alloc",
+ CACHE_HINT_NOCACHE,
[this, FNAME](auto &t)
{
auto target = get_alloc_tail_target();
@@ -653,6 +654,7 @@ JournalTrimmerImpl::trim_dirty()
return extent_callback->with_transaction_intr(
Transaction::src_t::TRIM_DIRTY,
"trim_dirty",
+ CACHE_HINT_NOCACHE,
[this, FNAME](auto &t)
{
auto target = get_dirty_tail_target();
@@ -1125,6 +1127,7 @@ SegmentCleaner::do_reclaim_space(
return extent_callback->with_transaction_intr(
src,
"clean_reclaim_space",
+ CACHE_HINT_NOCACHE,
[this, &backref_extents, &pin_list, &reclaimed](auto &t)
{
return seastar::do_with(
@@ -1240,6 +1243,7 @@ SegmentCleaner::clean_space_ret SegmentCleaner::clean_space()
return extent_callback->with_transaction_intr(
Transaction::src_t::READ,
"retrieve_from_backref_tree",
+ CACHE_HINT_NOCACHE,
[this, &weak_read_ret](auto &t) {
return backref_manager.get_mappings(
t,
@@ -1506,6 +1510,7 @@ bool SegmentCleaner::check_usage()
SpaceTrackerIRef tracker(space_tracker->make_empty());
extent_callback->with_transaction_weak(
"check_usage",
+ CACHE_HINT_NOCACHE,
[this, &tracker](auto &t) {
return backref_manager.scan_mapped_space(
t,
@@ -1812,6 +1817,7 @@ bool RBMCleaner::check_usage()
RBMSpaceTracker tracker(rbms);
extent_callback->with_transaction_weak(
"check_usage",
+ CACHE_HINT_NOCACHE,
[this, &tracker, &rbms](auto &t) {
return backref_manager.scan_mapped_space(
t,
diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h
index 424247c5bdc..01ab44c4c7c 100644
--- a/src/crimson/os/seastore/async_cleaner.h
+++ b/src/crimson/os/seastore/async_cleaner.h
@@ -299,24 +299,29 @@ public:
/// Creates empty transaction
/// weak transaction should be type READ
virtual TransactionRef create_transaction(
- Transaction::src_t, const char *name, bool is_weak=false) = 0;
+ Transaction::src_t,
+ const char *name,
+ cache_hint_t cache_hint = CACHE_HINT_TOUCH,
+ bool is_weak=false) = 0;
/// Creates empty transaction with interruptible context
template <typename Func>
auto with_transaction_intr(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint,
Func &&f) {
return do_with_transaction_intr<Func, false>(
- src, name, std::forward<Func>(f));
+ src, name, cache_hint, std::forward<Func>(f));
}
template <typename Func>
auto with_transaction_weak(
const char* name,
+ cache_hint_t cache_hint,
Func &&f) {
return do_with_transaction_intr<Func, true>(
- Transaction::src_t::READ, name, std::forward<Func>(f)
+ Transaction::src_t::READ, name, cache_hint, std::forward<Func>(f)
).handle_error(
crimson::ct_error::eagain::assert_failure{"unexpected eagain"},
crimson::ct_error::pass_further_all{}
@@ -385,9 +390,10 @@ private:
auto do_with_transaction_intr(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint,
Func &&f) {
return seastar::do_with(
- create_transaction(src, name, IsWeak),
+ create_transaction(src, name, cache_hint, IsWeak),
[f=std::forward<Func>(f)](auto &ref_t) mutable {
return with_trans_intr(
*ref_t,
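
The trimming, cleaning, and scrub-style call sites above now pass CACHE_HINT_NOCACHE explicitly when opening their transactions, and the hint rides along into create_transaction(). A minimal sketch of that threading, with hypothetical hint_t/Txn/with_transaction names standing in for cache_hint_t, Transaction, and with_transaction_intr:

    #include <iostream>
    #include <string>
    #include <utility>

    enum class hint_t { TOUCH, NOCACHE };

    struct Txn {
      std::string name;
      hint_t hint;
    };

    template <typename Func>
    auto with_transaction(const char* name, hint_t hint, Func&& f) {
      // corresponds to create_transaction(src, name, cache_hint, is_weak)
      Txn t{name, hint};
      return std::forward<Func>(f)(t);
    }

    int main() {
      // Background work opts out of cache promotion at transaction creation:
      with_transaction("trim_alloc", hint_t::NOCACHE, [](Txn& t) {
        std::cout << t.name << " nocache="
                  << (t.hint == hint_t::NOCACHE) << '\n';
        return 0;
      });
    }
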
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
index 5898b9bad0a..86f816e1648 100644
--- a/src/crimson/os/seastore/cache.cc
+++ b/src/crimson/os/seastore/cache.cc
@@ -1483,7 +1483,7 @@ record_t Cache::prepare_record(
i->state = CachedExtent::extent_state_t::CLEAN;
assert(i->is_logical());
i->clear_modified_region();
- touch_extent(*i, &trans_src);
+ touch_extent(*i, &trans_src, t.get_cache_hint());
DEBUGT("inplace rewrite ool block is commmitted -- {}", t, *i);
}
@@ -1513,7 +1513,7 @@ record_t Cache::prepare_record(
if (i->is_dirty()) {
add_to_dirty(i, &t_src);
} else {
- touch_extent(*i, &t_src);
+ touch_extent(*i, &t_src, t.get_cache_hint());
}
alloc_delta.alloc_blk_ranges.emplace_back(
@@ -1759,7 +1759,7 @@ void Cache::complete_commit(
add_extent(i);
assert(!i->is_dirty());
const auto t_src = t.get_src();
- touch_extent(*i, &t_src);
+ touch_extent(*i, &t_src, t.get_cache_hint());
epm.commit_space_used(i->get_paddr(), i->get_length());
// Note: commit extents and backref allocations in the same place
@@ -2026,7 +2026,7 @@ Cache::replay_delta(
[](CachedExtent &) {},
[this](CachedExtent &ext) {
// replay is not included by the cache hit metrics
- touch_extent(ext, nullptr);
+ touch_extent(ext, nullptr, CACHE_HINT_TOUCH);
},
nullptr) :
_get_extent_if_cached(
diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h
index b2248ff37dd..a239b861726 100644
--- a/src/crimson/os/seastore/cache.h
+++ b/src/crimson/os/seastore/cache.h
@@ -124,6 +124,7 @@ public:
TransactionRef create_transaction(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint,
bool is_weak) {
LOG_PREFIX(Cache::create_transaction);
@@ -137,7 +138,8 @@ public:
[this](Transaction& t) {
return on_transaction_destruct(t);
},
- ++next_id
+ ++next_id,
+ cache_hint
);
SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}",
*ret, name, src, is_weak);
@@ -284,7 +286,7 @@ public:
SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}",
t, type, offset, *ret);
t.add_to_read_set(ret);
- touch_extent(*ret, &t_src);
+ touch_extent(*ret, &t_src, t.get_cache_hint());
return ret->wait_io().then([ret] {
return get_extent_if_cached_iertr::make_ready_future<
CachedExtentRef>(ret);
@@ -341,7 +343,7 @@ public:
t, T::TYPE, offset, length);
auto f = [&t, this, t_src](CachedExtent &ext) {
t.add_to_read_set(CachedExtentRef(&ext));
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent<T>(
@@ -389,7 +391,7 @@ public:
++stats.access.s.load_absent;
t.add_to_read_set(CachedExtentRef(&ext));
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent<T>(
@@ -487,7 +489,7 @@ public:
++access_stats.cache_lru;
++stats.access.s.cache_lru;
}
- touch_extent(*p_extent, &t_src);
+ touch_extent(*p_extent, &t_src, t.get_cache_hint());
} else {
if (p_extent->is_dirty()) {
++access_stats.trans_dirty;
@@ -834,7 +836,7 @@ private:
t, type, offset, length, laddr);
auto f = [&t, this, t_src](CachedExtent &ext) {
t.add_to_read_set(CachedExtentRef(&ext));
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent_by_type(
@@ -876,7 +878,7 @@ private:
++stats.access.s.load_absent;
t.add_to_read_set(CachedExtentRef(&ext));
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent_by_type(
@@ -1472,11 +1474,10 @@ private:
/// Update lru for access to ref
void touch_extent(
CachedExtent &ext,
- const Transaction::src_t* p_src)
+ const Transaction::src_t* p_src,
+ cache_hint_t hint)
{
- if (p_src &&
- is_background_transaction(*p_src) &&
- is_logical_type(ext.get_type())) {
+ if (hint == CACHE_HINT_NOCACHE && is_logical_type(ext.get_type())) {
return;
}
if (ext.is_stable_clean() && !ext.is_placeholder()) {
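
With this change the LRU skip in touch_extent() is keyed off the transaction's cache hint rather than off whether the source is a background transaction, so client reads carrying FADVISE_DONTNEED/NOCACHE also bypass promotion. A toy LRU illustrating the policy (hint_t/TinyLru are hypothetical stand-ins):

    #include <iostream>
    #include <list>
    #include <string>

    enum class hint_t { TOUCH, NOCACHE };

    struct TinyLru {
      std::list<std::string> order;  // front = most recently used
      void touch(const std::string& key, hint_t hint) {
        if (hint == hint_t::NOCACHE) {
          return;  // read the entry without promoting it
        }
        order.remove(key);
        order.push_front(key);
      }
    };

    int main() {
      TinyLru lru;
      lru.touch("hot-extent", hint_t::TOUCH);
      lru.touch("scanned-extent", hint_t::NOCACHE);  // no displacement
      std::cout << lru.order.front() << '\n';        // prints: hot-extent
    }

This is what keeps one-off scans (cleaning, trimming, DONTNEED reads) from evicting the hot working set.
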
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
index 5b51083f344..6a866cb1f9b 100644
--- a/src/crimson/os/seastore/seastore.cc
+++ b/src/crimson/os/seastore/seastore.cc
@@ -408,6 +408,7 @@ SeaStore::Shard::mkfs_managers()
return transaction_manager->with_transaction_intr(
Transaction::src_t::MUTATE,
"mkfs_seastore",
+ CACHE_HINT_TOUCH,
[this](auto& t)
{
LOG_PREFIX(SeaStoreS::mkfs_managers);
@@ -897,9 +898,10 @@ get_ranges(CollectionRef ch,
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
SeaStore::Shard::list_objects(CollectionRef ch,
- const ghobject_t& start,
- const ghobject_t& end,
- uint64_t limit) const
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit,
+ uint32_t op_flags) const
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -910,13 +912,14 @@ SeaStore::Shard::list_objects(CollectionRef ch,
return seastar::do_with(
RetType(std::vector<ghobject_t>(), start),
std::move(limit),
- [this, ch, start, end](auto& ret, auto& limit) {
- return repeat_eagain([this, ch, start, end, &limit, &ret] {
+ [this, ch, start, end, op_flags](auto& ret, auto& limit) {
+ return repeat_eagain([this, ch, start, end, &limit, &ret, op_flags] {
++(shard_stats.repeat_read_num);
return transaction_manager->with_transaction_intr(
Transaction::src_t::READ,
"list_objects",
+ op_flags,
[this, ch, start, end, &limit, &ret](auto &t)
{
LOG_PREFIX(SeaStoreS::list_objects);
@@ -1054,6 +1057,7 @@ SeaStore::Shard::list_collections()
return transaction_manager->with_transaction_intr(
Transaction::src_t::READ,
"list_collections",
+ CACHE_HINT_TOUCH,
[this, &ret](auto& t)
{
LOG_PREFIX(SeaStoreS::list_collections);
@@ -1137,6 +1141,7 @@ SeaStore::Shard::read(
Transaction::src_t::READ,
"read",
op_type_t::READ,
+ op_flags,
[this, offset, len, op_flags](auto &t, auto &onode) {
return _read(t, onode, offset, len, op_flags);
}).finally([this] {
@@ -1148,7 +1153,8 @@ SeaStore::Shard::read(
SeaStore::Shard::base_errorator::future<bool>
SeaStore::Shard::exists(
CollectionRef c,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
LOG_PREFIX(SeaStoreS::exists);
++(shard_stats.read_num);
@@ -1160,6 +1166,7 @@ SeaStore::Shard::exists(
Transaction::src_t::READ,
"exists",
op_type_t::READ,
+ op_flags,
[FNAME](auto& t, auto&) {
DEBUGT("exists", t);
return seastar::make_ready_future<bool>(true);
@@ -1240,7 +1247,8 @@ SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
SeaStore::Shard::get_attr(
CollectionRef ch,
const ghobject_t& oid,
- std::string_view name) const
+ std::string_view name,
+ uint32_t op_flags) const
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1251,6 +1259,7 @@ SeaStore::Shard::get_attr(
Transaction::src_t::READ,
"get_attr",
op_type_t::GET_ATTR,
+ op_flags,
[this, name](auto &t, auto& onode) {
return _get_attr(t, onode, name);
}).handle_error(
@@ -1296,7 +1305,8 @@ SeaStore::Shard::_get_attrs(
SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t>
SeaStore::Shard::get_attrs(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1307,6 +1317,7 @@ SeaStore::Shard::get_attrs(
Transaction::src_t::READ,
"get_attrs",
op_type_t::GET_ATTRS,
+ op_flags,
[this](auto &t, auto& onode) {
return _get_attrs(t, onode);
}).handle_error(
@@ -1338,7 +1349,8 @@ seastar::future<struct stat> SeaStore::Shard::_stat(
seastar::future<struct stat> SeaStore::Shard::stat(
CollectionRef c,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1349,6 +1361,7 @@ seastar::future<struct stat> SeaStore::Shard::stat(
Transaction::src_t::READ,
"stat",
op_type_t::STAT,
+ op_flags,
[this, oid](auto &t, auto &onode) {
return _stat(t, onode, oid);
}).handle_error(
@@ -1364,9 +1377,10 @@ seastar::future<struct stat> SeaStore::Shard::stat(
SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
SeaStore::Shard::omap_get_header(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
- return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY);
+ return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY, op_flags);
}
SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t>
@@ -1389,7 +1403,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const omap_keys_t &keys)
+ const omap_keys_t &keys,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1400,6 +1415,7 @@ SeaStore::Shard::omap_get_values(
Transaction::src_t::READ,
"omap_get_values",
op_type_t::OMAP_GET_VALUES,
+ op_flags,
[this, keys](auto &t, auto &onode) {
return do_omap_get_values(t, onode, keys);
}).finally([this] {
@@ -1529,7 +1545,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_paged_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const std::optional<std::string> &start)
+ const std::optional<std::string> &start,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1540,6 +1557,7 @@ SeaStore::Shard::omap_get_values(
Transaction::src_t::READ,
"omap_get_values2",
op_type_t::OMAP_GET_VALUES2,
+ op_flags,
[this, start](auto &t, auto &onode) {
return do_omap_get_values(t, onode, start);
}).finally([this] {
@@ -1589,7 +1607,8 @@ SeaStore::Shard::fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1600,6 +1619,7 @@ SeaStore::Shard::fiemap(
Transaction::src_t::READ,
"fiemap",
op_type_t::READ,
+ op_flags,
[this, off, len](auto &t, auto &onode) {
return _fiemap(t, onode, off, len);
}).finally([this] {
@@ -2677,6 +2697,7 @@ seastar::future<> SeaStore::Shard::write_meta(
return transaction_manager->with_transaction_intr(
Transaction::src_t::MUTATE,
"write_meta",
+ CACHE_HINT_NOCACHE,
[this, &key, &value](auto& t)
{
LOG_PREFIX(SeaStoreS::write_meta);
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
index fd7e177da63..e2a993b9e20 100644
--- a/src/crimson/os/seastore/seastore.h
+++ b/src/crimson/os/seastore/seastore.h
@@ -101,7 +101,8 @@ public:
seastar::future<struct stat> stat(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<ceph::bufferlist> read(
CollectionRef c,
@@ -118,32 +119,38 @@ public:
base_errorator::future<bool> exists(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
get_attr_errorator::future<ceph::bufferlist> get_attr(
CollectionRef c,
const ghobject_t& oid,
- std::string_view name) const final;
+ std::string_view name,
+ uint32_t op_flags = 0) const final;
get_attrs_ertr::future<attrs_t> get_attrs(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) final;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) final;
/// Retrieves paged set of values > start (if present)
read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) final; ///< @return <done, values> values.empty() iff done
get_attr_errorator::future<bufferlist> omap_get_header(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
/// std::get<1>(ret) returns end if and only if the listing has listed all
/// the items within the range, otherwise it returns the next key to be listed.
@@ -151,7 +158,8 @@ public:
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const final;
+ uint64_t limit,
+ uint32_t op_flags = 0) const final;
seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
@@ -170,7 +178,8 @@ public:
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len) final;
+ uint64_t len,
+ uint32_t op_flags = 0) final;
unsigned get_max_attr_name_length() const final {
return 256;
@@ -251,7 +260,8 @@ public:
return seastar::do_with(
internal_context_t(
ch, std::move(t),
- transaction_manager->create_transaction(src, tname)),
+ transaction_manager->create_transaction(
+ src, tname, t.get_fadvise_flags())),
std::forward<F>(f),
[this, op_type](auto &ctx, auto &f) {
assert(shard_stats.starting_io_num);
@@ -298,20 +308,22 @@ public:
Transaction::src_t src,
const char* tname,
op_type_t op_type,
+ cache_hint_t cache_hint_flags,
F &&f) const {
auto begin_time = std::chrono::steady_clock::now();
return seastar::do_with(
oid, Ret{}, std::forward<F>(f),
- [this, ch, src, op_type, begin_time, tname
+ [this, ch, src, op_type, begin_time, tname, cache_hint_flags
](auto &oid, auto &ret, auto &f)
{
- return repeat_eagain([&, this, ch, src, tname] {
+ return repeat_eagain([&, this, ch, src, tname, cache_hint_flags] {
assert(src == Transaction::src_t::READ);
++(shard_stats.repeat_read_num);
return transaction_manager->with_transaction_intr(
src,
tname,
+ cache_hint_flags,
[&, this, ch, tname](auto& t)
{
LOG_PREFIX(SeaStoreS::repeat_with_onode);
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h
index 335a439dcb5..5930469ca07 100644
--- a/src/crimson/os/seastore/seastore_types.h
+++ b/src/crimson/os/seastore/seastore_types.h
@@ -20,9 +20,42 @@
#include "include/intarith.h"
#include "include/interval_set.h"
#include "include/uuid.h"
+#include "include/rados.h"
namespace crimson::os::seastore {
+class cache_hint_t {
+ enum hint_t {
+ TOUCH,
+ NOCACHE
+ };
+public:
+ static constexpr cache_hint_t get_touch() {
+ return hint_t::TOUCH;
+ }
+ static constexpr cache_hint_t get_nocache() {
+ return hint_t::NOCACHE;
+ }
+ cache_hint_t(uint32_t flags) {
+ if (unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) ||
+ unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) {
+ hint = NOCACHE;
+ }
+ }
+ bool operator==(const cache_hint_t &other) const {
+ return hint == other.hint;
+ }
+ bool operator!=(const cache_hint_t &other) const {
+ return hint != other.hint;
+ }
+private:
+ constexpr cache_hint_t(hint_t hint) : hint(hint) {}
+ hint_t hint = hint_t::TOUCH;
+};
+
+inline constexpr cache_hint_t CACHE_HINT_TOUCH = cache_hint_t::get_touch();
+inline constexpr cache_hint_t CACHE_HINT_NOCACHE = cache_hint_t::get_nocache();
+
/* using a special xattr key "omap_header" to store omap header */
const std::string OMAP_HEADER_XATTR_KEY = "omap_header";
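
The converting constructor of cache_hint_t collapses any "don't keep this cached" fadvise bit into NOCACHE and leaves everything else at the TOUCH default. A standalone sketch of that mapping, using placeholder flag values rather than the real CEPH_OSD_OP_FLAG_FADVISE_* bits from include/rados.h:

    #include <cassert>
    #include <cstdint>

    // Hypothetical bit values; the real ones live in include/rados.h.
    constexpr uint32_t FADVISE_DONTNEED = 1u << 3;
    constexpr uint32_t FADVISE_NOCACHE  = 1u << 5;

    enum class hint_t { TOUCH, NOCACHE };

    // Mirrors cache_hint_t(uint32_t flags): either eviction-friendly
    // fadvise bit collapses to NOCACHE; everything else stays TOUCH.
    hint_t hint_from_flags(uint32_t op_flags) {
      if (op_flags & (FADVISE_DONTNEED | FADVISE_NOCACHE)) {
        return hint_t::NOCACHE;
      }
      return hint_t::TOUCH;
    }

    int main() {
      assert(hint_from_flags(0) == hint_t::TOUCH);
      assert(hint_from_flags(FADVISE_DONTNEED) == hint_t::NOCACHE);
      assert(hint_from_flags(FADVISE_NOCACHE | 0x1) == hint_t::NOCACHE);
    }
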
diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h
index 66a9f896520..cd8c333c69f 100644
--- a/src/crimson/os/seastore/transaction.h
+++ b/src/crimson/os/seastore/transaction.h
@@ -409,12 +409,14 @@ public:
src_t src,
journal_seq_t initiated_after,
on_destruct_func_t&& f,
- transaction_id_t trans_id
+ transaction_id_t trans_id,
+ cache_hint_t cache_hint
) : weak(weak),
handle(std::move(handle)),
on_destruct(std::move(f)),
src(src),
- trans_id(trans_id)
+ trans_id(trans_id),
+ cache_hint(cache_hint)
{}
void invalidate_clear_write_set() {
@@ -573,6 +575,10 @@ public:
return pre_alloc_list;
}
+ cache_hint_t get_cache_hint() const {
+ return cache_hint;
+ }
+
private:
friend class Cache;
friend Ref make_test_transaction();
@@ -682,6 +688,8 @@ private:
seastar::lw_shared_ptr<rbm_pending_ool_t> pending_ool;
backref_entry_refs_t backref_entries;
+
+ cache_hint_t cache_hint = CACHE_HINT_TOUCH;
};
using TransactionRef = Transaction::Ref;
@@ -694,7 +702,8 @@ inline TransactionRef make_test_transaction() {
Transaction::src_t::MUTATE,
JOURNAL_SEQ_NULL,
[](Transaction&) {},
- ++next_id
+ ++next_id,
+ CACHE_HINT_TOUCH
);
}
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
index 753bd5d6ff6..807d88b2cbc 100644
--- a/src/crimson/os/seastore/transaction_manager.cc
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -66,6 +66,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
return with_transaction_intr(
Transaction::src_t::MUTATE,
"mkfs_tm",
+ CACHE_HINT_TOUCH,
[this, FNAME](auto& t)
{
cache->init();
@@ -131,6 +132,7 @@ TransactionManager::mount()
journal->get_trimmer().set_journal_head(start_seq);
return with_transaction_weak(
"mount",
+ CACHE_HINT_TOUCH,
[this](auto &t)
{
return cache->init_cached_extents(t, [this](auto &t, auto &e) {
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
index dc6cc20cf59..e574460894a 100644
--- a/src/crimson/os/seastore/transaction_manager.h
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -741,8 +741,9 @@ public:
TransactionRef create_transaction(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint = CACHE_HINT_TOUCH,
bool is_weak=false) final {
- return cache->create_transaction(src, name, is_weak);
+ return cache->create_transaction(src, name, cache_hint, is_weak);
}
using ExtentCallbackInterface::submit_transaction_direct_ret;
diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h
index 64544d4c870..ce649303d4f 100644
--- a/src/crimson/osd/backfill_facades.h
+++ b/src/crimson/osd/backfill_facades.h
@@ -82,6 +82,9 @@ struct PGFacade final : BackfillState::PGFacade {
}
PGFacade(PG& pg) : pg(pg) {}
+ std::ostream &print(std::ostream &out) const override {
+ return out << pg;
+ }
};
} // namespace crimson::osd
diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc
index 837fd2eb2af..f957f072c93 100644
--- a/src/crimson/osd/backfill_state.cc
+++ b/src/crimson/osd/backfill_state.cc
@@ -8,11 +8,7 @@
#include "crimson/osd/backfill_state.h"
#include "osd/osd_types_fmt.h"
-namespace {
- seastar::logger& logger() {
- return crimson::get_logger(ceph_subsys_osd);
- }
-}
+SET_SUBSYS(osd);
namespace crimson::osd {
@@ -27,22 +23,23 @@ BackfillState::BackfillState(
progress_tracker(
std::make_unique<BackfillState::ProgressTracker>(backfill_machine))
{
- logger().debug("{}:{}", __func__, __LINE__);
+ LOG_PREFIX(BackfillState::BackfillState);
+ DEBUGDPP("", *backfill_machine.pg);
backfill_machine.initiate();
}
template <class S>
BackfillState::StateHelper<S>::StateHelper()
{
- logger().debug("enter {}",
- boost::typeindex::type_id<S>().pretty_name());
+ LOG_PREFIX(BackfillState::StateHelper);
+ DEBUGDPP("enter {}", pg(), boost::typeindex::type_id<S>().pretty_name());
}
template <class S>
BackfillState::StateHelper<S>::~StateHelper()
{
- logger().debug("exit {}",
- boost::typeindex::type_id<S>().pretty_name());
+ LOG_PREFIX(BackfillState::StateHelper);
+ DEBUG("exit {}", boost::typeindex::type_id<S>().pretty_name());
}
BackfillState::~BackfillState() = default;
@@ -63,13 +60,16 @@ BackfillState::BackfillMachine::~BackfillMachine() = default;
BackfillState::Initial::Initial(my_context ctx)
: my_base(ctx)
{
+ LOG_PREFIX(BackfillState::Initial::Initial);
backfill_state().last_backfill_started = peering_state().earliest_backfill();
- logger().debug("{}: bft={} from {}",
- __func__, peering_state().get_backfill_targets(),
- backfill_state().last_backfill_started);
+ DEBUGDPP("{}: bft={} from {}",
+ pg(),
+ __func__,
+ peering_state().get_backfill_targets(),
+ backfill_state().last_backfill_started);
for (const auto& bt : peering_state().get_backfill_targets()) {
- logger().debug("{}: target shard {} from {}",
- __func__, bt, peering_state().get_peer_last_backfill(bt));
+ DEBUGDPP("{}: target shard {} from {}",
+ pg(), __func__, bt, peering_state().get_peer_last_backfill(bt));
}
ceph_assert(peering_state().get_backfill_targets().size());
ceph_assert(!backfill_state().last_backfill_started.is_max());
@@ -80,7 +80,8 @@ BackfillState::Initial::Initial(my_context ctx)
boost::statechart::result
BackfillState::Initial::react(const BackfillState::Triggered& evt)
{
- logger().debug("{}: backfill triggered", __func__);
+ LOG_PREFIX(BackfillState::Initial::react::Triggered);
+ DEBUGDPP("", pg());
ceph_assert(backfill_state().last_backfill_started == \
peering_state().earliest_backfill());
ceph_assert(peering_state().is_backfilling());
@@ -93,26 +94,10 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt)
if (Enqueuing::all_enqueued(peering_state(),
backfill_state().backfill_info,
backfill_state().peer_backfill_info)) {
- logger().debug("{}: switching to Done state", __func__);
+ DEBUGDPP("switching to Done state", pg());
return transit<BackfillState::Done>();
} else {
- logger().debug("{}: switching to Enqueuing state", __func__);
- return transit<BackfillState::Enqueuing>();
- }
-}
-
-boost::statechart::result
-BackfillState::Cancelled::react(const BackfillState::Triggered& evt)
-{
- logger().debug("{}: backfill re-triggered", __func__);
- ceph_assert(peering_state().is_backfilling());
- if (Enqueuing::all_enqueued(peering_state(),
- backfill_state().backfill_info,
- backfill_state().peer_backfill_info)) {
- logger().debug("{}: switching to Done state", __func__);
- return transit<BackfillState::Done>();
- } else {
- logger().debug("{}: switching to Enqueuing state", __func__);
+ DEBUGDPP("switching to Enqueuing state", pg());
return transit<BackfillState::Enqueuing>();
}
}
@@ -120,9 +105,10 @@ BackfillState::Cancelled::react(const BackfillState::Triggered& evt)
// -- Enqueuing
void BackfillState::Enqueuing::maybe_update_range()
{
+ LOG_PREFIX(BackfillState::Enqueuing::maybe_update_range);
if (auto& primary_bi = backfill_state().backfill_info;
primary_bi.version >= pg().get_projected_last_update()) {
- logger().info("{}: bi is current", __func__);
+ INFODPP("bi is current", pg());
ceph_assert(primary_bi.version == pg().get_projected_last_update());
} else if (primary_bi.version >= peering_state().get_log_tail()) {
if (peering_state().get_pg_log().get_log().empty() &&
@@ -136,31 +122,31 @@ void BackfillState::Enqueuing::maybe_update_range()
ceph_assert(primary_bi.version == eversion_t());
return;
}
- logger().debug("{}: bi is old, ({}) can be updated with log to {}",
- __func__,
- primary_bi.version,
- pg().get_projected_last_update());
+ DEBUGDPP("{}: bi is old, ({}) can be updated with log to {}",
+ pg(),
+ primary_bi.version,
+ pg().get_projected_last_update());
auto func =
[&](const pg_log_entry_t& e) {
- logger().debug("maybe_update_range(lambda): updating from version {}",
- e.version);
+ DEBUGDPP("maybe_update_range(lambda): updating from version {}",
+ pg(), e.version);
if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) {
if (e.is_update()) {
- logger().debug("maybe_update_range(lambda): {} updated to ver {}",
- e.soid, e.version);
+ DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}",
+ pg(), e.soid, e.version);
primary_bi.objects.erase(e.soid);
primary_bi.objects.insert(std::make_pair(e.soid,
e.version));
} else if (e.is_delete()) {
- logger().debug("maybe_update_range(lambda): {} removed",
- e.soid);
+ DEBUGDPP("maybe_update_range(lambda): {} removed",
+ pg(), e.soid);
primary_bi.objects.erase(e.soid);
}
}
};
- logger().debug("{}: scanning pg log first", __func__);
+ DEBUGDPP("{}: scanning pg log first", pg());
peering_state().scan_log_after(primary_bi.version, func);
- logger().debug("{}: scanning projected log", __func__);
+ DEBUGDPP("{}: scanning projected log", pg());
pg().get_projected_log().scan_log_after(primary_bi.version, func);
primary_bi.version = pg().get_projected_last_update();
} else {
@@ -244,6 +230,7 @@ void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
BackfillState::Enqueuing::result_t
BackfillState::Enqueuing::remove_on_peers(const hobject_t& check)
{
+ LOG_PREFIX(BackfillState::Enqueuing::remove_on_peers);
// set `new_last_backfill_started` to `check`
result_t result { {}, check };
for (const auto& bt : peering_state().get_backfill_targets()) {
@@ -255,8 +242,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check)
backfill_listener().enqueue_drop(bt, pbi.begin, version);
}
}
- logger().debug("{}: BACKFILL removing {} from peers {}",
- __func__, check, result.pbi_targets);
+ DEBUGDPP("BACKFILL removing {} from peers {}",
+ pg(), check, result.pbi_targets);
ceph_assert(!result.pbi_targets.empty());
return result;
}
@@ -264,7 +251,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check)
BackfillState::Enqueuing::result_t
BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
{
- logger().debug("{}: check={}", __func__, check);
+ LOG_PREFIX(BackfillState::Enqueuing::update_on_peers);
+ DEBUGDPP("check={}", pg(), check);
const auto& primary_bi = backfill_state().backfill_info;
result_t result { {}, primary_bi.begin };
std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills;
@@ -325,6 +313,7 @@ bool BackfillState::Enqueuing::Enqueuing::all_emptied(
BackfillState::Enqueuing::Enqueuing(my_context ctx)
: my_base(ctx)
{
+ LOG_PREFIX(BackfillState::Enqueuing::Enqueuing);
auto& primary_bi = backfill_state().backfill_info;
// update our local interval to cope with recent changes
@@ -334,8 +323,7 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
// that backfill will be spinning here over and over. For the sake
// of performance and simplicity we don't synchronize with the entire PG;
// something similar can happen in the classical OSD.
- logger().warn("{}: bi is old, rescanning of local backfill_info",
- __func__);
+ WARNDPP("bi is old, rescanning of local backfill_info", pg());
post_event(RequestPrimaryScanning{});
return;
} else {
@@ -347,13 +335,14 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
primary_bi)) {
// need to grab another chunk of the object namespace and restart
// the queueing.
- logger().debug("{}: reached end for current local chunk", __func__);
+ DEBUGDPP("reached end for current local chunk", pg());
post_event(RequestPrimaryScanning{});
return;
}
do {
if (!backfill_listener().budget_available()) {
+ DEBUGDPP("throttle failed, turning to Waiting", pg());
post_event(RequestWaiting{});
return;
} else if (should_rescan_replicas(backfill_state().peer_backfill_info,
@@ -392,16 +381,25 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
}
} while (!all_emptied(primary_bi, backfill_state().peer_backfill_info));
- if (backfill_state().progress_tracker->tracked_objects_completed()
- && Enqueuing::all_enqueued(peering_state(),
- backfill_state().backfill_info,
- backfill_state().peer_backfill_info)) {
- backfill_state().last_backfill_started = hobject_t::get_max();
- backfill_listener().update_peers_last_backfill(hobject_t::get_max());
+ if (should_rescan_primary(backfill_state().peer_backfill_info,
+ primary_bi)) {
+ // need to grab another chunk of the object namespace and restart
+ // the queueing.
+ DEBUGDPP("reached end for current local chunk", pg());
+ post_event(RequestPrimaryScanning{});
+ return;
+ } else {
+ if (backfill_state().progress_tracker->tracked_objects_completed()
+ && Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ backfill_state().last_backfill_started = hobject_t::get_max();
+ backfill_listener().update_peers_last_backfill(hobject_t::get_max());
+ }
+ DEBUGDPP("reached end for both local and all peers "
+ "but still has in-flight operations", pg());
+ post_event(RequestWaiting{});
}
- logger().debug("{}: reached end for both local and all peers "
- "but still has in-flight operations", __func__);
- post_event(RequestWaiting{});
}
// -- PrimaryScanning
@@ -416,16 +414,45 @@ BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx)
boost::statechart::result
BackfillState::PrimaryScanning::react(PrimaryScanned evt)
{
- logger().debug("{}", __func__);
+ LOG_PREFIX(BackfillState::PrimaryScanning::react::PrimaryScanned);
+ DEBUGDPP("", pg());
backfill_state().backfill_info = std::move(evt.result);
- return transit<Enqueuing>();
+ if (!backfill_state().is_suspended()) {
+ return transit<Enqueuing>();
+ } else {
+ DEBUGDPP("backfill suspended, not going Enqueuing", pg());
+ backfill_state().go_enqueuing_on_resume();
+ }
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(CancelBackfill evt)
+{
+ LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill);
+ DEBUGDPP("suspended within PrimaryScanning", pg());
+ backfill_state().on_suspended();
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(Triggered evt)
+{
+ LOG_PREFIX(BackfillState::PrimaryScanning::react::Triggered);
+ ceph_assert(backfill_state().is_suspended());
+ if (backfill_state().on_resumed()) {
+ DEBUGDPP("Backfill resumed, going Enqueuing", pg());
+ return transit<Enqueuing>();
+ }
+ return discard_event();
}
boost::statechart::result
BackfillState::PrimaryScanning::react(ObjectPushed evt)
{
- logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}",
- evt.object);
+ LOG_PREFIX(BackfillState::PrimaryScanning::react::ObjectPushed);
+ DEBUGDPP("PrimaryScanning::react() on ObjectPushed; evt.object={}",
+ pg(), evt.object);
backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true);
return discard_event();
}
@@ -443,11 +470,11 @@ bool BackfillState::ReplicasScanning::replica_needs_scan(
BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx)
: my_base(ctx)
{
+ LOG_PREFIX(BackfillState::ReplicasScanning::ReplicasScanning);
for (const auto& bt : peering_state().get_backfill_targets()) {
if (const auto& pbi = backfill_state().peer_backfill_info.at(bt);
replica_needs_scan(pbi, backfill_state().backfill_info)) {
- logger().debug("{}: scanning peer osd.{} from {}",
- __func__, bt, pbi.end);
+ DEBUGDPP("scanning peer osd.{} from {}", pg(), bt, pbi.end);
backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{});
ceph_assert(waiting_on_backfill.find(bt) == \
@@ -469,8 +496,9 @@ BackfillState::ReplicasScanning::~ReplicasScanning()
boost::statechart::result
BackfillState::ReplicasScanning::react(ReplicaScanned evt)
{
- logger().debug("{}: got scan result from osd={}, result={}",
- __func__, evt.from, evt.result);
+ LOG_PREFIX(BackfillState::ReplicasScanning::react::ReplicaScanned);
+ DEBUGDPP("got scan result from osd={}, result={}",
+ pg(), evt.from, evt.result);
// TODO: maybe we'll be able to move waiting_on_backfill from
// the machine to the state.
ceph_assert(peering_state().is_backfill_target(evt.from));
@@ -479,12 +507,17 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt)
if (waiting_on_backfill.empty()) {
ceph_assert(backfill_state().peer_backfill_info.size() == \
peering_state().get_backfill_targets().size());
- return transit<Enqueuing>();
+ if (!backfill_state().is_suspended()) {
+ return transit<Enqueuing>();
+ } else {
+ DEBUGDPP("backfill suspended, not going Enqueuing", pg());
+ backfill_state().go_enqueuing_on_resume();
+ }
}
} else {
- // we canceled backfill for a while due to a too full, and this
+ // we suspended backfill for a while due to a too-full condition, and this
// is an extra response from a non-too-full peer
- logger().debug("{}: canceled backfill (too full?)", __func__);
+ DEBUGDPP("suspended backfill (too full?)", pg());
}
return discard_event();
}
@@ -492,17 +525,30 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt)
boost::statechart::result
BackfillState::ReplicasScanning::react(CancelBackfill evt)
{
- logger().debug("{}: cancelled within ReplicasScanning",
- __func__);
- waiting_on_backfill.clear();
- return transit<Cancelled>();
+ LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill);
+ DEBUGDPP("suspended within ReplicasScanning", pg());
+ backfill_state().on_suspended();
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(Triggered evt)
+{
+ LOG_PREFIX(BackfillState::ReplicasScanning::react::Triggered);
+ ceph_assert(backfill_state().is_suspended());
+ if (backfill_state().on_resumed()) {
+ DEBUGDPP("Backfill resumed, going Enqueuing", pg());
+ return transit<Enqueuing>();
+ }
+ return discard_event();
}
boost::statechart::result
BackfillState::ReplicasScanning::react(ObjectPushed evt)
{
- logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}",
- evt.object);
+ LOG_PREFIX(BackfillState::ReplicasScanning::react::ObjectPushed);
+ DEBUGDPP("ReplicasScanning::react() on ObjectPushed; evt.object={}",
+ pg(), evt.object);
backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true);
return discard_event();
}
@@ -517,17 +563,45 @@ BackfillState::Waiting::Waiting(my_context ctx)
boost::statechart::result
BackfillState::Waiting::react(ObjectPushed evt)
{
- logger().debug("Waiting::react() on ObjectPushed; evt.object={}",
- evt.object);
+ LOG_PREFIX(BackfillState::Waiting::react::ObjectPushed);
+ DEBUGDPP("Waiting::react() on ObjectPushed; evt.object={}", pg(), evt.object);
backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false);
- return transit<Enqueuing>();;
+ if (!backfill_state().is_suspended()) {
+ return transit<Enqueuing>();
+ } else {
+ DEBUGDPP("backfill suspended, not going Enqueuing", pg());
+ backfill_state().go_enqueuing_on_resume();
+ }
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(CancelBackfill evt)
+{
+ LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill);
+ DEBUGDPP("suspended within Waiting", pg());
+ backfill_state().on_suspended();
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(Triggered evt)
+{
+ LOG_PREFIX(BackfillState::Waiting::react::Triggered);
+ ceph_assert(backfill_state().is_suspended());
+ if (backfill_state().on_resumed()) {
+ DEBUGDPP("Backfill resumed, going Enqueuing", pg());
+ return transit<Enqueuing>();
+ }
+ return discard_event();
}
// -- Done
BackfillState::Done::Done(my_context ctx)
: my_base(ctx)
{
- logger().info("{}: backfill is done", __func__);
+ LOG_PREFIX(BackfillState::Done::Done);
+ INFODPP("backfill is done", pg());
backfill_listener().backfilled();
}
@@ -537,13 +611,6 @@ BackfillState::Crashed::Crashed()
ceph_abort_msg("{}: this should not happen");
}
-// -- Cancelled
-BackfillState::Cancelled::Cancelled(my_context ctx)
- : my_base(ctx)
-{
- ceph_assert(peering_state().get_backfill_targets().size());
-}
-
// ProgressTracker is an intermediary between the BackfillListener and
// BackfillMachine + its states. All requests to push or drop an object
// are directed through it. The same happens with notifications about
@@ -577,8 +644,8 @@ void BackfillState::ProgressTracker::complete_to(
const pg_stat_t& stats,
bool may_push_to_max)
{
- logger().debug("{}: obj={}",
- __func__, obj);
+ LOG_PREFIX(BackfillState::ProgressTracker::complete_to);
+ DEBUGDPP("obj={}", pg(), obj);
if (auto completion_iter = registry.find(obj);
completion_iter != std::end(registry)) {
completion_iter->second = \
@@ -619,4 +686,19 @@ void BackfillState::enqueue_standalone_push(
backfill_machine.backfill_listener.enqueue_push(obj, v, peers);
}
+void BackfillState::enqueue_standalone_delete(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers)
+{
+ progress_tracker->enqueue_drop(obj);
+ for (auto bt : peers) {
+ backfill_machine.backfill_listener.enqueue_drop(bt, obj, v);
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg) {
+ return pg.print(out);
+}
+
} // namespace crimson::osd
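
Instead of a dedicated Cancelled state, each scanning/waiting state now reacts to CancelBackfill by recording a suspension and to Triggered by resuming, transiting straight to Enqueuing if a scan completed while suspended. A condensed model of that bookkeeping (hypothetical SuspendState/Backfill names; the real helpers are added to backfill_state.h below):

    #include <cassert>

    struct SuspendState {
      bool suspended = false;
      bool should_go_enqueuing = false;
    };

    struct Backfill {
      SuspendState s;
      bool is_suspended() const { return s.suspended; }
      void on_suspended() { assert(!is_suspended()); s = {true, false}; }
      // Returns whether a deferred "go Enqueuing" was recorded meanwhile.
      bool on_resumed() {
        bool go = s.should_go_enqueuing;
        s = {false, false};
        return go;
      }
      void go_enqueuing_on_resume() {
        assert(is_suspended());
        s.should_go_enqueuing = true;
      }
    };

    int main() {
      Backfill b;
      b.on_suspended();           // CancelBackfill arrives mid-scan
      b.go_enqueuing_on_resume(); // a scan result lands while suspended
      assert(b.on_resumed());     // Triggered: resume straight to Enqueuing
    }
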
diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h
index 072c91e079d..517a02ea4df 100644
--- a/src/crimson/osd/backfill_state.h
+++ b/src/crimson/osd/backfill_state.h
@@ -62,6 +62,8 @@ struct BackfillState {
struct CancelBackfill : sc::event<CancelBackfill> {
};
+ struct ThrottleAcquired : sc::event<ThrottleAcquired> {
+ };
private:
// internal events
struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> {
@@ -136,34 +138,10 @@ public:
explicit Crashed();
};
- struct Cancelled : sc::state<Cancelled, BackfillMachine>,
- StateHelper<Cancelled> {
- using reactions = boost::mpl::list<
- sc::custom_reaction<Triggered>,
- sc::custom_reaction<PrimaryScanned>,
- sc::custom_reaction<ReplicaScanned>,
- sc::custom_reaction<ObjectPushed>,
- sc::transition<sc::event_base, Crashed>>;
- explicit Cancelled(my_context);
- // resume after triggering backfill by on_activate_complete().
- // transit to Enqueuing.
- sc::result react(const Triggered&);
- sc::result react(const PrimaryScanned&) {
- return discard_event();
- }
- sc::result react(const ReplicaScanned&) {
- return discard_event();
- }
- sc::result react(const ObjectPushed&) {
- return discard_event();
- }
- };
-
struct Initial : sc::state<Initial, BackfillMachine>,
StateHelper<Initial> {
using reactions = boost::mpl::list<
sc::custom_reaction<Triggered>,
- sc::transition<CancelBackfill, Cancelled>,
sc::transition<sc::event_base, Crashed>>;
explicit Initial(my_context);
// initialize after triggering backfill by on_activate_complete().
@@ -174,12 +152,9 @@ public:
struct Enqueuing : sc::state<Enqueuing, BackfillMachine>,
StateHelper<Enqueuing> {
using reactions = boost::mpl::list<
- sc::transition<CancelBackfill, Cancelled>,
sc::transition<RequestPrimaryScanning, PrimaryScanning>,
sc::transition<RequestReplicasScanning, ReplicasScanning>,
sc::transition<RequestWaiting, Waiting>,
- sc::transition<RequestDone, Done>,
- sc::transition<CancelBackfill, Cancelled>,
sc::transition<sc::event_base, Crashed>>;
explicit Enqueuing(my_context);
@@ -237,12 +212,15 @@ public:
sc::custom_reaction<ObjectPushed>,
sc::custom_reaction<PrimaryScanned>,
sc::transition<RequestDone, Done>,
- sc::transition<CancelBackfill, Cancelled>,
+ sc::custom_reaction<CancelBackfill>,
+ sc::custom_reaction<Triggered>,
sc::transition<sc::event_base, Crashed>>;
explicit PrimaryScanning(my_context);
sc::result react(ObjectPushed);
// collect scanning result and transit to Enqueuing.
sc::result react(PrimaryScanned);
+ sc::result react(CancelBackfill);
+ sc::result react(Triggered);
};
struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>,
@@ -251,6 +229,7 @@ public:
sc::custom_reaction<ObjectPushed>,
sc::custom_reaction<ReplicaScanned>,
sc::custom_reaction<CancelBackfill>,
+ sc::custom_reaction<Triggered>,
sc::transition<RequestDone, Done>,
sc::transition<sc::event_base, Crashed>>;
explicit ReplicasScanning(my_context);
@@ -259,6 +238,7 @@ public:
sc::result react(ObjectPushed);
sc::result react(ReplicaScanned);
sc::result react(CancelBackfill);
+ sc::result react(Triggered);
// indicate whether a particular peer should be scanned to retrieve
// BackfillInterval for new range of hobject_t namespace.
@@ -277,10 +257,14 @@ public:
using reactions = boost::mpl::list<
sc::custom_reaction<ObjectPushed>,
sc::transition<RequestDone, Done>,
- sc::transition<CancelBackfill, Cancelled>,
+ sc::custom_reaction<CancelBackfill>,
+ sc::custom_reaction<Triggered>,
+ sc::transition<ThrottleAcquired, Enqueuing>,
sc::transition<sc::event_base, Crashed>>;
explicit Waiting(my_context);
sc::result react(ObjectPushed);
+ sc::result react(CancelBackfill);
+ sc::result react(Triggered);
};
struct Done : sc::state<Done, BackfillMachine>,
@@ -308,6 +292,11 @@ public:
const hobject_t &obj,
const eversion_t &v,
const std::vector<pg_shard_t> &peers);
+ void enqueue_standalone_delete(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers);
+
bool is_triggered() const {
return backfill_machine.triggering_event() != nullptr;
@@ -325,6 +314,26 @@ public:
}
}
private:
+ struct backfill_suspend_state_t {
+ bool suspended = false;
+ bool should_go_enqueuing = false;
+ } backfill_suspend_state;
+ bool is_suspended() const {
+ return backfill_suspend_state.suspended;
+ }
+ void on_suspended() {
+ ceph_assert(!is_suspended());
+ backfill_suspend_state = {true, false};
+ }
+ bool on_resumed() {
+ auto go_enqueuing = backfill_suspend_state.should_go_enqueuing;
+ backfill_suspend_state = {false, false};
+ return go_enqueuing;
+ }
+ void go_enqueuing_on_resume() {
+ ceph_assert(is_suspended());
+ backfill_suspend_state.should_go_enqueuing = true;
+ }
hobject_t last_backfill_started;
BackfillInterval backfill_info;
std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
@@ -405,8 +414,10 @@ struct BackfillState::PGFacade {
virtual const eversion_t& get_projected_last_update() const = 0;
virtual const PGLog::IndexedLog& get_projected_log() const = 0;
+ virtual std::ostream &print(std::ostream &out) const = 0;
virtual ~PGFacade() {}
};
+std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg);
class BackfillState::ProgressTracker {
// TODO: apply_stat,
@@ -433,6 +444,9 @@ class BackfillState::ProgressTracker {
BackfillListener& backfill_listener() {
return backfill_machine.backfill_listener;
}
+ PGFacade& pg() {
+ return *backfill_machine.pg;
+ }
public:
ProgressTracker(BackfillMachine& backfill_machine)
@@ -447,3 +461,9 @@ public:
};
} // namespace crimson::osd
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::BackfillState::PGFacade>
+ : fmt::ostream_formatter {};
+#endif
+
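
The trailing fmt specialization is the standard opt-in for fmt >= 9, where types with an ostream operator<< are no longer formatted automatically; since the DEBUGDPP macros format the PGFacade prefix through fmt, the new logging would not compile against fmt 9+ without it. A self-contained sketch of the pattern (PGLike is a hypothetical stand-in for PGFacade):

    #include <fmt/format.h>
    #include <fmt/ostream.h>
    #include <iostream>

    struct PGLike { int id; };

    std::ostream& operator<<(std::ostream& out, const PGLike& pg) {
      return out << "pg[" << pg.id << "]";
    }

    // Opt back in to ostream-based formatting under fmt 9 and later.
    #if FMT_VERSION >= 90000
    template <> struct fmt::formatter<PGLike> : fmt::ostream_formatter {};
    #endif

    int main() {
      std::cout << fmt::format("{}", PGLike{3}) << '\n';  // prints: pg[3]
    }
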
diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h
index f8815a316d1..394375c1129 100644
--- a/src/crimson/osd/osd_operation.h
+++ b/src/crimson/osd/osd_operation.h
@@ -217,6 +217,9 @@ protected:
public:
static constexpr bool is_trackable = true;
+ virtual bool requires_pg() const {
+ return true;
+ }
};
template <class T>
@@ -338,6 +341,18 @@ public:
with_throttle_while(std::forward<Args>(args)...), *this);
}
+ // Returns std::nullopt if the throttle is acquired immediately,
+ // returns the future for the acquiring otherwise
+ std::optional<seastar::future<>>
+ try_acquire_throttle_now(crimson::osd::scheduler::params_t params) {
+ if (!max_in_progress || in_progress < max_in_progress) {
+ ++in_progress;
+ --pending;
+ return std::nullopt;
+ }
+ return acquire_throttle(params);
+ }
+
private:
void dump_detail(Formatter *f) const final;
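
try_acquire_throttle_now() gives callers a synchronous fast path: std::nullopt when a slot is free, otherwise a future that resolves once one is released. A toy analogue using std::promise/std::future in place of the seastar primitives (Throttle is a hypothetical name; the real check additionally treats max_in_progress == 0 as unlimited):

    #include <cstdint>
    #include <future>
    #include <optional>
    #include <queue>

    struct Throttle {
      uint64_t max_in_progress;
      uint64_t in_progress = 0;
      std::queue<std::promise<void>> waiters;

      // nullopt: slot taken synchronously. Otherwise: a future that
      // resolves once release() hands this caller a slot.
      std::optional<std::future<void>> try_acquire_now() {
        if (in_progress < max_in_progress) {
          ++in_progress;
          return std::nullopt;
        }
        waiters.emplace();
        return waiters.back().get_future();
      }

      void release() {
        if (!waiters.empty()) {
          waiters.front().set_value();  // slot passes directly to a waiter
          waiters.pop();
        } else {
          --in_progress;
        }
      }
    };

    int main() {
      Throttle th{1};
      auto a = th.try_acquire_now();  // nullopt: acquired immediately
      auto b = th.try_acquire_now();  // future: must wait
      th.release();                   // resolves b
      b->wait();
      return a.has_value();           // 0
    }
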
diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h
index 98443bdfc0f..91a6728fd4b 100644
--- a/src/crimson/osd/osd_operations/client_request.h
+++ b/src/crimson/osd/osd_operations/client_request.h
@@ -42,6 +42,10 @@ class ClientRequest final : public PhasedOperationT<ClientRequest>,
unsigned instance_id = 0;
public:
+ epoch_t get_epoch_sent_at() const {
+ return m->get_map_epoch();
+ }
+
/**
* instance_handle_t
*
diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h
index e12243ce430..fe4761c4ab4 100644
--- a/src/crimson/osd/osd_operations/logmissing_request.h
+++ b/src/crimson/osd/osd_operations/logmissing_request.h
@@ -36,6 +36,9 @@ public:
}
PipelineHandle &get_handle() { return handle; }
epoch_t get_epoch() const { return req->get_min_epoch(); }
+ epoch_t get_epoch_sent_at() const {
+ return req->get_map_epoch();
+ }
ConnectionPipeline &get_connection_pipeline();
diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h
index 71651d16789..bdb6c2ac6ac 100644
--- a/src/crimson/osd/osd_operations/logmissing_request_reply.h
+++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h
@@ -36,6 +36,9 @@ public:
}
PipelineHandle &get_handle() { return handle; }
epoch_t get_epoch() const { return req->get_min_epoch(); }
+ epoch_t get_epoch_sent_at() const {
+ return req->get_map_epoch();
+ }
ConnectionPipeline &get_connection_pipeline();
diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h
index 85de5c711d6..aa6b8a95a94 100644
--- a/src/crimson/osd/osd_operations/peering_event.h
+++ b/src/crimson/osd/osd_operations/peering_event.h
@@ -44,6 +44,10 @@ protected:
float delay = 0;
PGPeeringEvent evt;
+ epoch_t get_epoch_sent_at() const {
+ return evt.get_epoch_sent();
+ }
+
const pg_shard_t get_from() const {
return from;
}
@@ -84,6 +88,10 @@ public:
evt(std::forward<Args>(args)...)
{}
+ bool requires_pg() const final {
+ return evt.requires_pg;
+ }
+
void print(std::ostream &) const final;
void dump_detail(ceph::Formatter* f) const final;
seastar::future<> with_pg(
diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h
index 43be7319545..21702f6ff4f 100644
--- a/src/crimson/osd/osd_operations/pg_advance_map.h
+++ b/src/crimson/osd/osd_operations/pg_advance_map.h
@@ -50,6 +50,10 @@ public:
PGPeeringPipeline::Process::BlockingEvent
> tracking_events;
+ epoch_t get_epoch_sent_at() const {
+ return to;
+ }
+
private:
PGPeeringPipeline &peering_pp(PG &pg);
};
diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h
index 17c2faf97ea..2fe8ff372b3 100644
--- a/src/crimson/osd/osd_operations/recovery_subrequest.h
+++ b/src/crimson/osd/osd_operations/recovery_subrequest.h
@@ -39,6 +39,9 @@ public:
}
PipelineHandle &get_handle() { return handle; }
epoch_t get_epoch() const { return m->get_min_epoch(); }
+ epoch_t get_epoch_sent_at() const {
+ return m->get_map_epoch();
+ }
ConnectionPipeline &get_connection_pipeline();
diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h
index 37d2771a33b..c2494b3715f 100644
--- a/src/crimson/osd/osd_operations/replicated_request.h
+++ b/src/crimson/osd/osd_operations/replicated_request.h
@@ -36,6 +36,9 @@ public:
}
PipelineHandle &get_handle() { return handle; }
epoch_t get_epoch() const { return req->get_min_epoch(); }
+ epoch_t get_epoch_sent_at() const {
+ return req->get_map_epoch();
+ }
ConnectionPipeline &get_connection_pipeline();
diff --git a/src/crimson/osd/osd_operations/scrub_events.h b/src/crimson/osd/osd_operations/scrub_events.h
index 02a5d852bb7..8bed90e4c14 100644
--- a/src/crimson/osd/osd_operations/scrub_events.h
+++ b/src/crimson/osd/osd_operations/scrub_events.h
@@ -27,11 +27,11 @@ class RemoteScrubEventBaseT : public PhasedOperationT<T> {
crimson::net::ConnectionRef l_conn;
crimson::net::ConnectionXcoreRef r_conn;
- epoch_t epoch;
spg_t pgid;
protected:
using interruptor = InterruptibleOperation::interruptor;
+ epoch_t epoch;
template <typename U=void>
using ifut = InterruptibleOperation::interruptible_future<U>;
@@ -40,7 +40,7 @@ protected:
public:
RemoteScrubEventBaseT(
crimson::net::ConnectionRef conn, epoch_t epoch, spg_t pgid)
- : l_conn(std::move(conn)), epoch(epoch), pgid(pgid) {}
+ : l_conn(std::move(conn)), pgid(pgid), epoch(epoch) {}
PGPeeringPipeline &get_peering_pipeline(PG &pg);
@@ -117,6 +117,10 @@ public:
: RemoteScrubEventBaseT<ScrubRequested>(std::forward<Args>(base_args)...),
deep(deep) {}
+ epoch_t get_epoch_sent_at() const {
+ return epoch;
+ }
+
void print(std::ostream &out) const final {
out << "(deep=" << deep << ")";
}
@@ -141,6 +145,10 @@ public:
ceph_assert(scrub::PGScrubber::is_scrub_message(*m));
}
+ epoch_t get_epoch_sent_at() const {
+ return epoch;
+ }
+
void print(std::ostream &out) const final {
out << "(m=" << *m << ")";
}
diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc
index bf521498abf..2746e730f2b 100644
--- a/src/crimson/osd/pg.cc
+++ b/src/crimson/osd/pg.cc
@@ -879,6 +879,17 @@ void PG::enqueue_push_for_backfill(
backfill_state->enqueue_standalone_push(obj, v, peers);
}
+void PG::enqueue_delete_for_backfill(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers)
+{
+ assert(recovery_handler);
+ assert(recovery_handler->backfill_state);
+ auto backfill_state = recovery_handler->backfill_state.get();
+ backfill_state->enqueue_standalone_delete(obj, v, peers);
+}
+
PG::interruptible_future<
std::tuple<PG::interruptible_future<>,
PG::interruptible_future<>>>
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h
index 6db73ee835b..06038c0aa00 100644
--- a/src/crimson/osd/pg.h
+++ b/src/crimson/osd/pg.h
@@ -904,6 +904,11 @@ private:
const hobject_t &obj,
const eversion_t &v,
const std::vector<pg_shard_t> &peers);
+ void enqueue_delete_for_backfill(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers);
+
bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const;
bool can_discard_op(const MOSDOp& m) const;
void context_registry_on_change();
diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc
index a40b28caa8b..79895de06de 100644
--- a/src/crimson/osd/pg_backend.cc
+++ b/src/crimson/osd/pg_backend.cc
@@ -1325,9 +1325,10 @@ maybe_get_omap_vals(
PGBackend::ll_read_ierrorator::future<ceph::bufferlist>
PGBackend::omap_get_header(
const crimson::os::CollectionRef& c,
- const ghobject_t& oid) const
+ const ghobject_t& oid,
+ uint32_t op_flags) const
{
- return store->omap_get_header(c, oid)
+ return store->omap_get_header(c, oid, op_flags)
.handle_error(
crimson::ct_error::enodata::handle([] {
return seastar::make_ready_future<bufferlist>();
@@ -1340,10 +1341,13 @@ PGBackend::ll_read_ierrorator::future<>
PGBackend::omap_get_header(
const ObjectState& os,
OSDOp& osd_op,
- object_stat_sum_t& delta_stats) const
+ object_stat_sum_t& delta_stats,
+ uint32_t op_flags) const
{
if (os.oi.is_omap()) {
- return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then_interruptible(
+ return omap_get_header(
+ coll, ghobject_t{os.oi.soid}, CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
+ ).safe_then_interruptible(
[&delta_stats, &osd_op] (ceph::bufferlist&& header) {
osd_op.outdata = std::move(header);
delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
@@ -1707,7 +1711,8 @@ PGBackend::fiemap(
CollectionRef c,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
return store->fiemap(c, oid, off, len);
}
diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h
index c24176a10e7..9c2230375b0 100644
--- a/src/crimson/osd/pg_backend.h
+++ b/src/crimson/osd/pg_backend.h
@@ -315,7 +315,8 @@ public:
CollectionRef c,
const ghobject_t& oid,
uint64_t off,
- uint64_t len);
+ uint64_t len,
+ uint32_t op_flags = 0);
write_iertr::future<> tmapput(
ObjectState& os,
@@ -375,11 +376,13 @@ public:
object_stat_sum_t& delta_stats);
ll_read_ierrorator::future<ceph::bufferlist> omap_get_header(
const crimson::os::CollectionRef& c,
- const ghobject_t& oid) const;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) const;
ll_read_ierrorator::future<> omap_get_header(
const ObjectState& os,
OSDOp& osd_op,
- object_stat_sum_t& delta_stats) const;
+ object_stat_sum_t& delta_stats,
+ uint32_t op_flags = 0) const;
interruptible_future<> omap_set_header(
ObjectState& os,
const OSDOp& osd_op,
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
index ec3af0d2b00..5eef584c776 100644
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -67,8 +67,6 @@ PGRecovery::start_recovery_ops(
if (max_to_start > 0) {
max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started);
}
- using interruptor =
- crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>;
return interruptor::parallel_for_each(started,
[] (auto&& ifut) {
return std::move(ifut);
@@ -609,8 +607,21 @@ void PGRecovery::update_peers_last_backfill(
bool PGRecovery::budget_available() const
{
- // TODO: the limits!
- return true;
+ crimson::osd::scheduler::params_t params =
+ {1, 0, crimson::osd::scheduler::scheduler_class_t::background_best_effort};
+ auto &ss = pg->get_shard_services();
+ auto futopt = ss.try_acquire_throttle_now(std::move(params));
+ if (!futopt) {
+ return true;
+ }
+ std::ignore = interruptor::make_interruptible(std::move(*futopt)
+ ).then_interruptible([this] {
+ assert(!backfill_state->is_triggered());
+ using BackfillState = crimson::osd::BackfillState;
+ backfill_state->process_event(
+ BackfillState::ThrottleAcquired{}.intrusive_from_this());
+ });
+ return false;
}
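
The rewritten budget_available() follows a try-now-or-subscribe shape: when the shard throttle has capacity, the caller keeps going synchronously; when it does not, a continuation is chained onto the returned future to re-trigger backfill with a ThrottleAcquired event, and the caller backs off. A minimal sketch of the same shape, with SimpleThrottle as an illustrative stand-in rather than a Ceph API:

    #include <functional>
    #include <optional>
    #include <queue>

    // Illustrative stand-in for the shard throttle; not a Ceph API.
    class SimpleThrottle {
      unsigned free_slots_;
      std::queue<std::function<void()>> waiters_;
    public:
      explicit SimpleThrottle(unsigned slots) : free_slots_(slots) {}

      // Mirrors try_acquire_throttle_now(): an empty optional means the
      // slot was taken synchronously and the caller may proceed inline;
      // otherwise the callback is queued and the caller must back off
      // until the callback re-triggers it.
      std::optional<int> try_acquire_now(std::function<void()> on_acquired) {
        if (free_slots_ > 0) {
          --free_slots_;
          return std::nullopt;
        }
        waiters_.push(std::move(on_acquired));
        return 0;  // dummy stand-in for the future the caller would chain on
      }

      void release() {
        if (!waiters_.empty()) {
          auto cb = std::move(waiters_.front());
          waiters_.pop();
          cb();  // hand the freed slot directly to the oldest waiter
        } else {
          ++free_slots_;
        }
      }
    };
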
void PGRecovery::on_pg_clean()
diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h
index 657e6d3e888..5c7b5c5ef2b 100644
--- a/src/crimson/osd/pg_recovery.h
+++ b/src/crimson/osd/pg_recovery.h
@@ -25,6 +25,8 @@ class PGBackend;
class PGRecovery : public crimson::osd::BackfillState::BackfillListener {
public:
+ using interruptor =
+ crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>;
template <typename T = void>
using interruptible_future = RecoveryBackend::interruptible_future<T>;
PGRecovery(PGRecoveryListener* pg) : pg(pg) {}
diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h
index b9879c8c9dd..f7bd7a6c08e 100644
--- a/src/crimson/osd/pg_shard_manager.h
+++ b/src/crimson/osd/pg_shard_manager.h
@@ -256,18 +256,40 @@ public:
auto &opref = *op;
return opref.template with_blocking_event<
PGMap::PGCreationBlockingEvent
- >([&target_shard_services, &opref](auto &&trigger) {
- return target_shard_services.wait_for_pg(
- std::move(trigger), opref.get_pgid());
- }).safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) {
- logger.debug("{}: have_pg", opref);
- return opref.with_pg(target_shard_services, pgref);
- }).handle_error(
- crimson::ct_error::ecanceled::handle([&logger, &opref](auto) {
- logger.debug("{}: pg creation canceled, dropping", opref);
- return seastar::now();
- })
- ).then([op=std::move(op)] {});
+ >([&target_shard_services, &opref, &logger](auto &&trigger) mutable {
+ auto pg = target_shard_services.get_pg(opref.get_pgid());
+ auto fut = ShardServices::wait_for_pg_ertr::make_ready_future<Ref<PG>>(pg);
+ if (!pg) {
+ if (opref.requires_pg()) {
+ auto osdmap = target_shard_services.get_map();
+ if (!osdmap->is_up_acting_osd_shard(
+ opref.get_pgid(), target_shard_services.local_state.whoami)) {
+ logger.debug(
+ "pg {} for {} is no longer here, discarding",
+ opref.get_pgid(), opref);
+ opref.get_handle().exit();
+ auto _fut = seastar::now();
+ if (osdmap->get_epoch() > opref.get_epoch_sent_at()) {
+ _fut = target_shard_services.send_incremental_map(
+ std::ref(opref.get_foreign_connection()),
+ opref.get_epoch_sent_at() + 1);
+ }
+ return _fut;
+ }
+ }
+ fut = target_shard_services.wait_for_pg(
+ std::move(trigger), opref.get_pgid());
+ }
+ return fut.safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) {
+ logger.debug("{}: have_pg", opref);
+ return opref.with_pg(target_shard_services, pgref);
+ }).handle_error(
+ crimson::ct_error::ecanceled::handle([&logger, &opref](auto) {
+ logger.debug("{}: pg creation canceled, dropping", opref);
+ return seastar::now();
+ })
+ );
+ }).then([op=std::move(op)] {});
}
seastar::future<> load_pgs(crimson::os::FuturizedStore& store);
diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc
index f09cd147ea9..6c8abecffaf 100644
--- a/src/crimson/osd/replicated_backend.cc
+++ b/src/crimson/osd/replicated_backend.cc
@@ -96,11 +96,18 @@ ReplicatedBackend::submit_transaction(
bufferlist encoded_txn;
encode(txn, encoded_txn);
+ bool is_delete = false;
for (auto &le : log_entries) {
le.mark_unrollbackable();
+ if (le.is_delete()) {
+ is_delete = true;
+ }
}
+ co_await pg.update_snap_map(log_entries, txn);
+
std::vector<pg_shard_t> to_push_clone;
+ std::vector<pg_shard_t> to_push_delete;
auto sends = std::make_unique<std::vector<seastar::future<>>>();
for (auto &pg_shard : pg_shards) {
if (pg_shard == whoami) {
@@ -115,12 +122,17 @@ ReplicatedBackend::submit_transaction(
m = new_repop_msg(
pg_shard, hoid, encoded_txn, osd_op_p,
min_epoch, map_epoch, log_entries, false, tid);
- if (_new_clone && pg.is_missing_on_peer(pg_shard, hoid)) {
- // The head is in the push queue but hasn't been pushed yet.
- // We need to ensure that the newly created clone will be
- // pushed as well, otherwise we might skip it.
- // See: https://tracker.ceph.com/issues/68808
- to_push_clone.push_back(pg_shard);
+ if (pg.is_missing_on_peer(pg_shard, hoid)) {
+ if (_new_clone) {
+ // The head is in the push queue but hasn't been pushed yet.
+ // We need to ensure that the newly created clone will be
+          // pushed as well; otherwise we might skip it.
+ // See: https://tracker.ceph.com/issues/68808
+ to_push_clone.push_back(pg_shard);
+ }
+ if (is_delete) {
+ to_push_delete.push_back(pg_shard);
+ }
}
}
pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}});
@@ -130,8 +142,6 @@ ReplicatedBackend::submit_transaction(
pg_shard.osd, std::move(m), map_epoch));
}
- co_await pg.update_snap_map(log_entries, txn);
-
pg.log_operation(
std::move(log_entries),
osd_op_p.pg_trim_to,
@@ -157,7 +167,8 @@ ReplicatedBackend::submit_transaction(
return seastar::now();
}
return peers->all_committed.get_shared_future();
- }).then_interruptible([pending_txn, this, _new_clone,
+ }).then_interruptible([pending_txn, this, _new_clone, &hoid,
+ to_push_delete=std::move(to_push_delete),
to_push_clone=std::move(to_push_clone)] {
auto acked_peers = std::move(pending_txn->second.acked_peers);
pending_trans.erase(pending_txn);
@@ -167,6 +178,9 @@ ReplicatedBackend::submit_transaction(
_new_clone->obs.oi.version,
to_push_clone);
}
+ if (!to_push_delete.empty()) {
+ pg.enqueue_delete_for_backfill(hoid, {}, to_push_delete);
+ }
return seastar::make_ready_future<
crimson::osd::acked_peers_t>(std::move(acked_peers));
});
diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc
index 76f24196b51..0d6c9d38236 100644
--- a/src/crimson/osd/replicated_recovery_backend.cc
+++ b/src/crimson/osd/replicated_recovery_backend.cc
@@ -35,6 +35,15 @@ ReplicatedRecoveryBackend::recover_object(
logger().debug("recover_object: loading obc: {}", soid);
return pg.obc_loader.with_obc<RWState::RWREAD>(soid,
[this, soid, need](auto head, auto obc) {
+ if (!obc->obs.exists) {
+      // XXX: this recovery must have been triggered by backfill, and the
+      // corresponding object must have been deleted by a client request
+      // after the object was enqueued for push but before the recovery
+      // acquired the lock.
+      //
+      // Abort the recovery in this case; a "recover_delete" must have been
+      // added for this object by the client request that deleted it.
+ return interruptor::now();
+ }
logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid);
auto& recovery_waiter = get_recovering(soid);
recovery_waiter.obc = obc;
@@ -306,7 +315,10 @@ ReplicatedRecoveryBackend::recover_delete(
}
return seastar::make_ready_future<>();
}).then_interruptible([this, soid, &stat_diff] {
- pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true);
+ const auto &missing = pg.get_peering_state().get_pg_log().get_missing();
+ if (!missing.is_missing(soid)) {
+ pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true);
+ }
return seastar::make_ready_future<>();
});
});
@@ -568,14 +580,17 @@ ReplicatedRecoveryBackend::read_metadata_for_push_op(
return seastar::make_ready_future<eversion_t>(ver);
}
return interruptor::make_interruptible(interruptor::when_all_succeed(
- backend->omap_get_header(coll, ghobject_t(oid)).handle_error_interruptible<false>(
+ backend->omap_get_header(
+ coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
+ ).handle_error_interruptible<false>(
crimson::os::FuturizedStore::Shard::read_errorator::all_same_way(
[oid] (const std::error_code& e) {
logger().debug("read_metadata_for_push_op, error {} when getting omap header: {}", e, oid);
return seastar::make_ready_future<bufferlist>();
})),
- interruptor::make_interruptible(store->get_attrs(coll, ghobject_t(oid)))
- .handle_error_interruptible<false>(
+ interruptor::make_interruptible(
+ store->get_attrs(coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
+ ).handle_error_interruptible<false>(
crimson::os::FuturizedStore::Shard::get_attrs_ertr::all_same_way(
[oid] (const std::error_code& e) {
logger().debug("read_metadata_for_push_op, error {} when getting attrs: {}", e, oid);
@@ -613,8 +628,14 @@ ReplicatedRecoveryBackend::read_object_for_push_op(
return seastar::make_ready_future<uint64_t>(offset);
}
// 1. get the extents in the interested range
- return interruptor::make_interruptible(backend->fiemap(coll, ghobject_t{oid},
- 0, copy_subset.range_end())).safe_then_interruptible(
+ return interruptor::make_interruptible(
+ backend->fiemap(
+ coll,
+ ghobject_t{oid},
+ 0,
+ copy_subset.range_end(),
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
+ ).safe_then_interruptible(
[=, this](auto&& fiemap_included) mutable {
interval_set<uint64_t> extents;
try {
@@ -630,8 +651,12 @@ ReplicatedRecoveryBackend::read_object_for_push_op(
push_op->data_included.span_of(extents, offset, max_len);
// 3. read the truncated extents
// TODO: check if the returned extents are pruned
- return interruptor::make_interruptible(store->readv(coll, ghobject_t{oid},
- push_op->data_included, 0));
+ return interruptor::make_interruptible(
+ store->readv(
+ coll,
+ ghobject_t{oid},
+ push_op->data_included,
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED));
}).safe_then_interruptible([push_op, range_end=copy_subset.range_end()](auto &&bl) {
push_op->data.claim_append(std::move(bl));
uint64_t recovered_to = 0;
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
index c2340898929..e1acb34636f 100644
--- a/src/crimson/osd/shard_services.cc
+++ b/src/crimson/osd/shard_services.cc
@@ -783,6 +783,11 @@ seastar::future<> ShardServices::dispatch_context_transaction(
co_return;
}
+Ref<PG> ShardServices::get_pg(spg_t pgid)
+{
+ return local_state.get_pg(pgid);
+}
+
seastar::future<> ShardServices::dispatch_context_messages(
BufferedRecoveryMessages &&ctx)
{
diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h
index 56ac4963fff..f1ed9b8d911 100644
--- a/src/crimson/osd/shard_services.h
+++ b/src/crimson/osd/shard_services.h
@@ -483,6 +483,8 @@ public:
return pg_to_shard_mapping.remove_pg_mapping(pgid);
}
+ Ref<PG> get_pg(spg_t pgid);
+
crimson::common::CephContext *get_cct() {
return &(local_state.cct);
}
@@ -589,6 +591,7 @@ public:
FORWARD_TO_OSD_SINGLETON(get_pool_info)
FORWARD(with_throttle_while, with_throttle_while, local_state.throttler)
+ FORWARD(try_acquire_throttle_now, try_acquire_throttle_now, local_state.throttler)
FORWARD_TO_OSD_SINGLETON(build_incremental_map_msg)
FORWARD_TO_OSD_SINGLETON(send_incremental_map)
diff --git a/src/crimson/tools/store_nbd/tm_driver.cc b/src/crimson/tools/store_nbd/tm_driver.cc
index 389ecd78afc..870809c5153 100644
--- a/src/crimson/tools/store_nbd/tm_driver.cc
+++ b/src/crimson/tools/store_nbd/tm_driver.cc
@@ -25,6 +25,7 @@ seastar::future<> TMDriver::write(
return tm->with_transaction_intr(
Transaction::src_t::MUTATE,
"write",
+ CACHE_HINT_TOUCH,
[this, offset, &ptr](auto& t)
{
return tm->remove(t, laddr_t::from_byte_offset(offset)
@@ -112,6 +113,7 @@ seastar::future<bufferlist> TMDriver::read(
return tm->with_transaction_intr(
Transaction::src_t::READ,
"read",
+ CACHE_HINT_TOUCH,
[=, &blret, this](auto& t)
{
return read_extents(t, laddr_t::from_byte_offset(offset), size
diff --git a/src/exporter/ceph_exporter.cc b/src/exporter/ceph_exporter.cc
index 44b67c7e615..2232851c094 100644
--- a/src/exporter/ceph_exporter.cc
+++ b/src/exporter/ceph_exporter.cc
@@ -30,13 +30,13 @@ static void handle_signal(int signum)
static void usage() {
std::cout << "usage: ceph-exporter [options]\n"
<< "options:\n"
- " --sock-dir: The path to ceph daemons socket files dir\n"
- " --addrs: Host ip address where exporter is deployed\n"
- " --port: Port to deploy exporter on. Default is 9926\n"
- " --cert-file: Path to the certificate file to use https\n"
- " --key-file: Path to the certificate key file to use https\n"
+ " --sock-dir: The path to Ceph daemon sockets (*.asok)\n"
+ " --addrs: Host IP address on which the exporter is to listen\n"
+              " --port: TCP port on which the exporter is to listen. Default is 9926\n"
+ " --cert-file: Path to the certificate file when using HTTPS\n"
+ " --key-file: Path to the certificate key file when using HTTPS\n"
" --prio-limit: Only perf counters greater than or equal to prio-limit are fetched. Default: 5\n"
- " --stats-period: Time to wait before sending requests again to exporter server (seconds). Default: 5s"
+ " --stats-period: Interval between daemon scrapes (seconds). Default: 5s"
<< std::endl;
generic_server_usage();
}
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index 4a7ac3ea6e0..0f5a9036eff 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -202,6 +202,8 @@ inline namespace v14_2_0 {
int set_complete_callback(void *cb_arg, callback_t cb);
int set_safe_callback(void *cb_arg, callback_t cb)
__attribute__ ((deprecated));
+ /// Request immediate cancellation as if by IoCtx::aio_cancel().
+ int cancel();
int wait_for_complete();
int wait_for_safe() __attribute__ ((deprecated));
int wait_for_complete_and_cb();
@@ -772,17 +774,30 @@ inline namespace v14_2_0 {
void tier_evict();
};
- /* IoCtx : This is a context in which we can perform I/O.
- * It includes a Pool,
+ /**
+ * @brief A handle to a RADOS pool used to perform I/O operations.
*
* Typical use (error checking omitted):
- *
+ * @code
* IoCtx p;
* rados.ioctx_create("my_pool", p);
- * p->stat(&stats);
- * ... etc ...
+ * p.stat("my_object", &size, &mtime);
+ * @endcode
+ *
+ * IoCtx holds a pointer to its underlying implementation. The dup()
+ * method performs a deep copy of this implementation, but the copy
+ * construction and assignment operations perform shallow copies by
+ * sharing that pointer.
+ *
+ * Function names starting with aio_ are asynchronous operations that
+ * return immediately after submitting a request, and whose completions
+ * are managed by the given AioCompletion pointer. The IoCtx's underlying
+ * implementation is involved in the delivery of these completions, so
+   * the caller must guarantee that its lifetime is preserved until then:
+   * if not by preserving the IoCtx instance that submitted the request,
+   * then by a copied/moved instance that shares the same implementation.
*
- * NOTE: be sure to call watch_flush() prior to destroying any IoCtx
+ * @note Be sure to call watch_flush() prior to destroying any IoCtx
* that is used for watch events to ensure that racing callbacks
* have completed.
*/
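
A concrete illustration of the lifetime rule documented above: a shallow copy can keep the shared implementation alive until an async operation completes. A minimal sketch assuming a connected librados::Rados handle `rados`; pool and object names are placeholders:

    librados::IoCtx ioctx;
    rados.ioctx_create("my_pool", ioctx);

    librados::IoCtx io2 = ioctx;           // shallow copy: shared implementation
    librados::AioCompletion *c = librados::Rados::aio_create_completion();
    librados::bufferlist bl;
    io2.aio_read("obj", c, &bl, 4096, 0);  // submit via the copy
    // even if ioctx is destroyed now, io2 preserves the implementation
    c->wait_for_complete();
    c->release();
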
@@ -791,9 +806,13 @@ inline namespace v14_2_0 {
public:
IoCtx();
static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool);
+ /// Construct a shallow copy of rhs, sharing its underlying implementation.
IoCtx(const IoCtx& rhs);
+ /// Assign a shallow copy of rhs, sharing its underlying implementation.
IoCtx& operator=(const IoCtx& rhs);
+ /// Move construct from rhs, transferring its underlying implementation.
IoCtx(IoCtx&& rhs) noexcept;
+ /// Move assign from rhs, transferring its underlying implementation.
IoCtx& operator=(IoCtx&& rhs) noexcept;
~IoCtx();
@@ -1150,7 +1169,8 @@ inline namespace v14_2_0 {
int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts);
/**
- * Cancel aio operation
+ * Request immediate cancellation with error code -ECANCELED
+ * if the operation hasn't already completed.
*
* @param c completion handle
* @returns 0 on success, negative error code on failure
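
A hedged sketch of the cancellation flow from the caller's side, assuming a connected IoCtx `io`; whether the read completes normally or with -ECANCELED depends on the race:

    librados::AioCompletion *c = librados::Rados::aio_create_completion();
    librados::bufferlist bl;
    io.aio_read("obj", c, &bl, 4096, 0);  // "obj" is a placeholder
    c->cancel();                          // equivalent to io.aio_cancel(c)
    c->wait_for_complete();               // still wait before release()
    if (c->get_return_value() == -ECANCELED) {
      // cancellation won the race; bl contents are not meaningful
    }
    c->release();
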
diff --git a/src/include/random.h b/src/include/random.h
index f2e3e37bcd7..6b7c9405efd 100644
--- a/src/include/random.h
+++ b/src/include/random.h
@@ -16,9 +16,9 @@
#define CEPH_RANDOM_H 1
#include <mutex>
+#include <optional>
#include <random>
#include <type_traits>
-#include <boost/optional.hpp>
// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85494
#ifdef __MINGW32__
@@ -123,7 +123,7 @@ void randomize_rng()
template <typename EngineT>
EngineT& engine()
{
- thread_local boost::optional<EngineT> rng_engine;
+ thread_local std::optional<EngineT> rng_engine;
if (!rng_engine) {
rng_engine.emplace(EngineT());
diff --git a/src/kv/KeyValueDB.h b/src/kv/KeyValueDB.h
index 858742d511e..d926840180e 100644
--- a/src/kv/KeyValueDB.h
+++ b/src/kv/KeyValueDB.h
@@ -9,6 +9,7 @@
#include <map>
#include <optional>
#include <string>
+#include <string_view>
#include <boost/scoped_ptr.hpp>
#include "include/encoding.h"
#include "common/Formatter.h"
@@ -211,6 +212,10 @@ public:
return "";
}
virtual ceph::buffer::list value() = 0;
+  // When valid() returns true, the value returned as a string_view
+  // is guaranteed to remain valid until the iterator is moved to another
+  // position, that is, until the next call to next() / seek_to_first() / etc.
+ virtual std::string_view value_as_sv() = 0;
virtual int status() = 0;
virtual ~SimplestIteratorImpl() {}
};
@@ -220,7 +225,12 @@ public:
virtual ~IteratorImpl() {}
virtual int seek_to_last() = 0;
virtual int prev() = 0;
+  // When valid() returns true, the key returned as a string_view
+  // is guaranteed to remain valid until the iterator is moved to another
+  // position, that is, until the next call to next() / seek_to_first() / etc.
+ virtual std::string_view key_as_sv() = 0;
virtual std::pair<std::string, std::string> raw_key() = 0;
+ virtual std::pair<std::string_view, std::string_view> raw_key_as_sv() = 0;
virtual ceph::buffer::ptr value_as_ptr() {
ceph::buffer::list bl = value();
if (bl.length() == 1) {
@@ -247,7 +257,9 @@ public:
virtual int next() = 0;
virtual int prev() = 0;
virtual std::string key() = 0;
+ virtual std::string_view key_as_sv() = 0;
virtual std::pair<std::string,std::string> raw_key() = 0;
+ virtual std::pair<std::string_view, std::string_view> raw_key_as_sv() = 0;
virtual bool raw_key_is_prefixed(const std::string &prefix) = 0;
virtual ceph::buffer::list value() = 0;
virtual ceph::buffer::ptr value_as_ptr() {
@@ -258,6 +270,7 @@ public:
return ceph::buffer::ptr();
}
}
+ virtual std::string_view value_as_sv() = 0;
virtual int status() = 0;
virtual size_t key_size() {
return 0;
@@ -315,15 +328,24 @@ private:
std::string key() override {
return generic_iter->key();
}
+ std::string_view key_as_sv() override {
+ return generic_iter->key_as_sv();
+ }
std::pair<std::string, std::string> raw_key() override {
return generic_iter->raw_key();
}
+ std::pair<std::string_view, std::string_view> raw_key_as_sv() override {
+ return generic_iter->raw_key_as_sv();
+ }
ceph::buffer::list value() override {
return generic_iter->value();
}
ceph::buffer::ptr value_as_ptr() override {
return generic_iter->value_as_ptr();
}
+ std::string_view value_as_sv() override {
+ return generic_iter->value_as_sv();
+ }
int status() override {
return generic_iter->status();
}
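
The validity comments above amount to a copy-before-advance rule for the new *_as_sv() accessors. A minimal sketch, with `it` standing for any KeyValueDB::WholeSpaceIterator and "wanted_key" a placeholder:

    std::string saved_value;                     // owns the bytes beyond next()
    for (it->seek_to_first(); it->valid(); it->next()) {
      std::string_view k = it->key_as_sv();      // borrowed from the current position
      std::string_view v = it->value_as_sv();    // dies on the next seek/advance
      if (k == "wanted_key") {
        saved_value.assign(v.data(), v.size());  // copy out before it->next()
        break;
      }
    }
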
diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc
index ca63ea06484..51d224b67c0 100644
--- a/src/kv/RocksDBStore.cc
+++ b/src/kv/RocksDBStore.cc
@@ -6,6 +6,7 @@
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>
@@ -47,6 +48,7 @@ using std::ostream;
using std::pair;
using std::set;
using std::string;
+using std::string_view;
using std::unique_ptr;
using std::vector;
@@ -1992,7 +1994,7 @@ int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key)
// Find separator inside Slice
char* separator = (char*) memchr(in.data(), 0, in.size());
- if (separator == NULL)
+ if (separator == nullptr)
return -EINVAL;
prefix_len = size_t(separator - in.data());
if (prefix_len >= in.size())
@@ -2006,6 +2008,27 @@ int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key)
return 0;
}
+// TODO: deduplicate the code, preferably by removing the string variant
+int RocksDBStore::split_key(rocksdb::Slice in, string_view *prefix, string_view *key)
+{
+ size_t prefix_len = 0;
+
+ // Find separator inside Slice
+ char* separator = (char*) memchr(in.data(), 0, in.size());
+ if (separator == nullptr)
+ return -EINVAL;
+ prefix_len = size_t(separator - in.data());
+ if (prefix_len >= in.size())
+ return -EINVAL;
+
+ // Fetch prefix and/or key directly from Slice
+ if (prefix)
+ *prefix = string_view(in.data(), prefix_len);
+ if (key)
+ *key = string_view(separator + 1, in.size() - prefix_len - 1);
+ return 0;
+}
+
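
For reference, the layout split_key() decodes is the NUL-separated `prefix '\0' key` encoding used for the RocksDB keys. A small sanity-check sketch; note the explicit slice length, since the bytes contain an embedded NUL:

    rocksdb::Slice in("p\0meta", 6);  // prefix "p", key "meta"
    std::string_view prefix, key;
    int r = RocksDBStore::split_key(in, &prefix, &key);
    // r == 0, prefix == "p", key == "meta"; a slice with no NUL
    // separator would yield -EINVAL instead
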
void RocksDBStore::compact()
{
dout(2) << __func__ << " starting" << dendl;
@@ -2226,7 +2249,13 @@ int RocksDBStore::RocksDBWholeSpaceIteratorImpl::prev()
string RocksDBStore::RocksDBWholeSpaceIteratorImpl::key()
{
string out_key;
- split_key(dbiter->key(), 0, &out_key);
+ split_key(dbiter->key(), nullptr, &out_key);
+ return out_key;
+}
+string_view RocksDBStore::RocksDBWholeSpaceIteratorImpl::key_as_sv()
+{
+ string_view out_key;
+ split_key(dbiter->key(), nullptr, &out_key);
return out_key;
}
pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key()
@@ -2235,6 +2264,12 @@ pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key()
split_key(dbiter->key(), &prefix, &key);
return make_pair(prefix, key);
}
+pair<string_view,string_view> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_as_sv()
+{
+ string_view prefix, key;
+ split_key(dbiter->key(), &prefix, &key);
+ return make_pair(prefix, key);
+}
bool RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_is_prefixed(const string &prefix) {
  // Look for "prefix\0" right in rocksdb::Slice
@@ -2267,6 +2302,12 @@ bufferptr RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_as_ptr()
return bufferptr(val.data(), val.size());
}
+std::string_view RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_as_sv()
+{
+ rocksdb::Slice val = dbiter->value();
+ return std::string_view{val.data(), val.size()};
+}
+
int RocksDBStore::RocksDBWholeSpaceIteratorImpl::status()
{
return dbiter->status().ok() ? 0 : -1;
@@ -2348,9 +2389,15 @@ public:
string key() override {
return dbiter->key().ToString();
}
+ string_view key_as_sv() override {
+ return dbiter->key().ToStringView();
+ }
std::pair<std::string, std::string> raw_key() override {
return make_pair(prefix, key());
}
+ std::pair<std::string_view, std::string_view> raw_key_as_sv() override {
+ return make_pair(prefix, dbiter->key().ToStringView());
+ }
bufferlist value() override {
return to_bufferlist(dbiter->value());
}
@@ -2358,6 +2405,10 @@ public:
rocksdb::Slice val = dbiter->value();
return bufferptr(val.data(), val.size());
}
+ std::string_view value_as_sv() override {
+ rocksdb::Slice val = dbiter->value();
+ return std::string_view{val.data(), val.size()};
+ }
int status() override {
return dbiter->status().ok() ? 0 : -1;
}
@@ -2668,6 +2719,15 @@ public:
}
}
+ std::string_view key_as_sv() override
+ {
+ if (smaller == on_main) {
+ return main->key_as_sv();
+ } else {
+ return current_shard->second->key_as_sv();
+ }
+ }
+
std::pair<std::string,std::string> raw_key() override
{
if (smaller == on_main) {
@@ -2677,6 +2737,15 @@ public:
}
}
+ std::pair<std::string_view,std::string_view> raw_key_as_sv() override
+ {
+ if (smaller == on_main) {
+ return main->raw_key_as_sv();
+ } else {
+ return { current_shard->first, current_shard->second->key_as_sv() };
+ }
+ }
+
bool raw_key_is_prefixed(const std::string &prefix) override
{
if (smaller == on_main) {
@@ -2695,6 +2764,15 @@ public:
}
}
+ std::string_view value_as_sv() override
+ {
+ if (smaller == on_main) {
+ return main->value_as_sv();
+ } else {
+ return current_shard->second->value_as_sv();
+ }
+ }
+
int status() override
{
//because we already had to inspect key, it must be ok
@@ -3017,9 +3095,15 @@ public:
string key() override {
return iters[0]->key().ToString();
}
+ string_view key_as_sv() override {
+ return iters[0]->key().ToStringView();
+ }
std::pair<std::string, std::string> raw_key() override {
return make_pair(prefix, key());
}
+ std::pair<std::string_view, std::string_view> raw_key_as_sv() override {
+ return make_pair(prefix, iters[0]->key().ToStringView());
+ }
bufferlist value() override {
return to_bufferlist(iters[0]->value());
}
@@ -3027,6 +3111,10 @@ public:
rocksdb::Slice val = iters[0]->value();
return bufferptr(val.data(), val.size());
}
+ std::string_view value_as_sv() override {
+ rocksdb::Slice val = iters[0]->value();
+ return std::string_view{val.data(), val.size()};
+ }
int status() override {
return iters[0]->status().ok() ? 0 : -1;
}
diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h
index 477b209854c..50b91be2bf6 100644
--- a/src/kv/RocksDBStore.h
+++ b/src/kv/RocksDBStore.h
@@ -386,10 +386,13 @@ public:
int next() override;
int prev() override;
std::string key() override;
+ std::string_view key_as_sv() override;
std::pair<std::string,std::string> raw_key() override;
+ std::pair<std::string_view,std::string_view> raw_key_as_sv() override;
bool raw_key_is_prefixed(const std::string &prefix) override;
ceph::bufferlist value() override;
ceph::bufferptr value_as_ptr() override;
+ std::string_view value_as_sv() override;
int status() override;
size_t key_size() override;
size_t value_size() override;
@@ -419,6 +422,7 @@ public:
}
static int split_key(rocksdb::Slice in, std::string *prefix, std::string *key);
+ static int split_key(rocksdb::Slice in, std::string_view *prefix, std::string_view *key);
static std::string past_prefix(const std::string &prefix);
diff --git a/src/librados/librados_asio.h b/src/librados/librados_asio.h
index 0aedc376575..3e5b7c57c6f 100644
--- a/src/librados/librados_asio.h
+++ b/src/librados/librados_asio.h
@@ -14,6 +14,9 @@
#ifndef LIBRADOS_ASIO_H
#define LIBRADOS_ASIO_H
+#include <boost/asio/associated_cancellation_slot.hpp>
+#include <boost/asio/cancellation_type.hpp>
+
#include "include/rados/librados.hpp"
#include "common/async/completion.h"
#include "librados/AioCompletionImpl.h"
@@ -74,6 +77,7 @@ struct Invoker<void> {
template <typename Result>
struct AsyncOp : Invoker<Result> {
unique_aio_completion_ptr aio_completion;
+ boost::asio::cancellation_slot slot;
using Signature = typename Invoker<Result>::Signature;
using Completion = ceph::async::Completion<Signature, AsyncOp<Result>>;
@@ -83,6 +87,7 @@ struct AsyncOp : Invoker<Result> {
auto p = std::unique_ptr<Completion>{static_cast<Completion*>(arg)};
// move result out of Completion memory being freed
auto op = std::move(p->user_data);
+ op.slot.clear(); // clear our cancellation handler
// access AioCompletionImpl directly to avoid locking
const librados::AioCompletionImpl* pc = op.aio_completion->pc;
const int ret = pc->rval;
@@ -94,11 +99,46 @@ struct AsyncOp : Invoker<Result> {
op.dispatch(std::move(p), ec, ver);
}
+ struct op_cancellation {
+ AioCompletion* completion = nullptr;
+ bool is_read = false;
+
+ void operator()(boost::asio::cancellation_type type) {
+ if (completion == nullptr) {
+ return; // no AioCompletion attached
+ } else if (type == boost::asio::cancellation_type::none) {
+ return; // no cancellation requested
+ } else if (is_read) {
+        // read operations produce no side effects, so they can satisfy the
+        // requirements of 'total' cancellation. The weaker requirements
+        // of 'partial' and 'terminal' are also satisfied.
+ completion->cancel();
+ } else if (type == boost::asio::cancellation_type::terminal) {
+ // write operations only support 'terminal' cancellation because we
+ // can't guarantee that no osd has succeeded (or will succeed) in
+ // applying the write
+ completion->cancel();
+ }
+ }
+ };
+
template <typename Executor1, typename CompletionHandler>
- static auto create(const Executor1& ex1, CompletionHandler&& handler) {
+ static auto create(const Executor1& ex1, bool is_read,
+ CompletionHandler&& handler) {
+ op_cancellation* cancel_handler = nullptr;
+ auto slot = boost::asio::get_associated_cancellation_slot(handler);
+ if (slot.is_connected()) {
+ cancel_handler = &slot.template emplace<op_cancellation>();
+ }
+
auto p = Completion::create(ex1, std::move(handler));
p->user_data.aio_completion.reset(
Rados::aio_create_completion(p.get(), aio_dispatch));
+ if (cancel_handler) {
+ cancel_handler->completion = p->user_data.aio_completion.get();
+ cancel_handler->is_read = is_read;
+ p->user_data.slot = std::move(slot);
+ }
return p;
}
};
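
A hedged usage sketch of the new per-operation cancellation: binding a cancellation slot to the completion handler lets an Asio caller cancel an in-flight librados operation. Per op_cancellation above, reads accept any cancellation type while writes honor only 'terminal'. Assumes a boost::asio::io_context `ioc` and a connected IoCtx `io`:

    boost::asio::cancellation_signal cancel_signal;

    librados::async_read(ioc, io, "obj", 4096, 0,  // "obj" is a placeholder
        boost::asio::bind_cancellation_slot(cancel_signal.slot(),
            [](boost::system::error_code ec, uint64_t ver,
               librados::bufferlist bl) {
              // on a successful cancel, ec reflects -ECANCELED
            }));

    // later, if the result is no longer needed:
    cancel_signal.emit(boost::asio::cancellation_type::total);
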
@@ -108,6 +148,9 @@ struct AsyncOp : Invoker<Result> {
/// Calls IoCtx::aio_read() and arranges for the AioCompletion to call a
/// given handler with signature (error_code, version_t, bufferlist).
+///
+/// The given IoCtx reference is not required to remain valid, but some IoCtx
+/// instance must preserve its underlying implementation until completion.
template <typename ExecutionContext, typename CompletionToken>
auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
size_t len, uint64_t off, CompletionToken&& token)
@@ -117,7 +160,8 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
return boost::asio::async_initiate<CompletionToken, Signature>(
[] (auto handler, auto ex, IoCtx& io, const std::string& oid,
size_t len, uint64_t off) {
- auto p = Op::create(ex, std::move(handler));
+ constexpr bool is_read = true;
+ auto p = Op::create(ex, is_read, std::move(handler));
auto& op = p->user_data;
int ret = io.aio_read(oid, op.aio_completion.get(), &op.result, len, off);
@@ -132,6 +176,9 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
/// Calls IoCtx::aio_write() and arranges for the AioCompletion to call a
/// given handler with signature (error_code, version_t).
+///
+/// The given IoCtx reference is not required to remain valid, but some IoCtx
+/// instance must preserve its underlying implementation until completion.
template <typename ExecutionContext, typename CompletionToken>
auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
const bufferlist &bl, size_t len, uint64_t off,
@@ -142,7 +189,8 @@ auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
return boost::asio::async_initiate<CompletionToken, Signature>(
[] (auto handler, auto ex, IoCtx& io, const std::string& oid,
const bufferlist &bl, size_t len, uint64_t off) {
- auto p = Op::create(ex, std::move(handler));
+ constexpr bool is_read = false;
+ auto p = Op::create(ex, is_read, std::move(handler));
auto& op = p->user_data;
int ret = io.aio_write(oid, op.aio_completion.get(), bl, len, off);
@@ -157,6 +205,9 @@ auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
/// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a
/// given handler with signature (error_code, version_t, bufferlist).
+///
+/// The given IoCtx reference is not required to remain valid, but some IoCtx
+/// instance must preserve its underlying implementation until completion.
template <typename ExecutionContext, typename CompletionToken>
auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
ObjectReadOperation *read_op, int flags,
@@ -167,7 +218,8 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
return boost::asio::async_initiate<CompletionToken, Signature>(
[] (auto handler, auto ex, IoCtx& io, const std::string& oid,
ObjectReadOperation *read_op, int flags) {
- auto p = Op::create(ex, std::move(handler));
+ constexpr bool is_read = true;
+ auto p = Op::create(ex, is_read, std::move(handler));
auto& op = p->user_data;
int ret = io.aio_operate(oid, op.aio_completion.get(), read_op,
@@ -183,6 +235,9 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
/// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a
/// given handler with signature (error_code, version_t).
+///
+/// The given IoCtx reference is not required to remain valid, but some IoCtx
+/// instance must preserve its underlying implementation until completion.
template <typename ExecutionContext, typename CompletionToken>
auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
ObjectWriteOperation *write_op, int flags,
@@ -194,7 +249,8 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
[] (auto handler, auto ex, IoCtx& io, const std::string& oid,
ObjectWriteOperation *write_op, int flags,
const jspan_context* trace_ctx) {
- auto p = Op::create(ex, std::move(handler));
+ constexpr bool is_read = false;
+ auto p = Op::create(ex, is_read, std::move(handler));
auto& op = p->user_data;
int ret = io.aio_operate(oid, op.aio_completion.get(), write_op, flags, trace_ctx);
@@ -209,6 +265,9 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
/// Calls IoCtx::aio_notify() and arranges for the AioCompletion to call a
/// given handler with signature (error_code, version_t, bufferlist).
+///
+/// The given IoCtx reference is not required to remain valid, but some IoCtx
+/// instance must preserve its underlying implementation until completion.
template <typename ExecutionContext, typename CompletionToken>
auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
bufferlist& bl, uint64_t timeout_ms, CompletionToken &&token)
@@ -218,7 +277,8 @@ auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid,
return boost::asio::async_initiate<CompletionToken, Signature>(
[] (auto handler, auto ex, IoCtx& io, const std::string& oid,
bufferlist& bl, uint64_t timeout_ms) {
- auto p = Op::create(ex, std::move(handler));
+ constexpr bool is_read = false;
+ auto p = Op::create(ex, is_read, std::move(handler));
auto& op = p->user_data;
int ret = io.aio_notify(oid, op.aio_completion.get(),
diff --git a/src/librados/librados_cxx.cc b/src/librados/librados_cxx.cc
index 2167eeade3c..60217b99b41 100644
--- a/src/librados/librados_cxx.cc
+++ b/src/librados/librados_cxx.cc
@@ -1103,6 +1103,14 @@ void librados::AioCompletion::release()
delete this;
}
+int librados::AioCompletion::cancel()
+{
+ if (!pc->io) {
+ return 0; // no operation was started
+ }
+ return pc->io->aio_cancel(pc);
+}
+
///////////////////////////// IoCtx //////////////////////////////
librados::IoCtx::IoCtx() : io_ctx_impl(NULL)
{
diff --git a/src/mgr/PyModule.h b/src/mgr/PyModule.h
index 177447c2cb3..a47db3a47ef 100644
--- a/src/mgr/PyModule.h
+++ b/src/mgr/PyModule.h
@@ -161,9 +161,9 @@ public:
}
const std::string &get_name() const {
- std::lock_guard l(lock) ; return module_name;
+ return module_name;
}
- const std::string &get_error_string() const {
+ std::string get_error_string() const {
     std::lock_guard l(lock); return error_string;
}
bool get_can_run() const {
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index 719403925ad..2d2735f1e7c 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -171,6 +171,8 @@ int NVMeofGwMap::cfg_delete_gw(
<< state.availability << " Resulting GW availability: "
<< state.availability << dendl;
       state.subsystems.clear(); // ignore subsystems of this GW
+ utime_t now = ceph_clock_now();
+ mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now;
return 0;
}
}
@@ -895,10 +897,12 @@ struct CMonRequestProposal : public Context {
}
};
-void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
+void NVMeofGwMap::get_health_checks(health_check_map_t *checks)
{
list<string> singleGatewayDetail;
list<string> gatewayDownDetail;
+ list<string> gatewayInDeletingDetail;
+ int deleting_gateways = 0;
for (const auto& created_map_pair: created_gws) {
const auto& group_key = created_map_pair.first;
auto& group = group_key.second;
@@ -915,9 +919,37 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
ostringstream ss;
ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." ;
gatewayDownDetail.push_back(ss.str());
+ } else if (gw_created.availability == gw_availability_t::GW_DELETING) {
+ deleting_gateways++;
+ utime_t now = ceph_clock_now();
+ bool found_deleting_time = false;
+        // take a reference; no need to copy the whole map per gateway
+        const auto &gws_deleting_time = mon->nvmegwmon()->gws_deleting_time;
+ auto group_it = gws_deleting_time.find(group_key);
+ if (group_it != gws_deleting_time.end()) {
+ auto& gw_map = group_it->second;
+ auto gw_it = gw_map.find(gw_id);
+ if (gw_it != gw_map.end()) {
+ found_deleting_time = true;
+ utime_t delete_time = gw_it->second;
+ if ((now - delete_time) > g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_delete_grace").count()) {
+ ostringstream ss;
+ ss << "NVMeoF Gateway '" << gw_id << "' is in deleting state.";
+ gatewayInDeletingDetail.push_back(ss.str());
+ }
+ }
+ }
+ if (!found_deleting_time) {
+ // DELETING gateway not found in gws_deleting_time, set timeout now
+ mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now;
+ }
}
}
}
+ if (deleting_gateways == 0) {
+ // no gateway in GW_DELETING state currently, flush old gws_deleting_time
+ mon->nvmegwmon()->gws_deleting_time.clear();
+ }
+
if (!singleGatewayDetail.empty()) {
ostringstream ss;
ss << singleGatewayDetail.size() << " group(s) have only 1 nvmeof gateway"
@@ -934,6 +966,15 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
ss.str(), gatewayDownDetail.size());
d.detail.swap(gatewayDownDetail);
}
+ if (!gatewayInDeletingDetail.empty()) {
+ ostringstream ss;
+    ss << gatewayInDeletingDetail.size() << " gateway(s) are in deleting state"
+       << "; namespaces are automatically balanced across the remaining gateways; "
+       << "this should take a few minutes.";
+ auto& d = checks->add("NVMEOF_GATEWAY_DELETING", HEALTH_WARN,
+ ss.str(), gatewayInDeletingDetail.size());
+ d.detail.swap(gatewayInDeletingDetail);
+ }
}
int NVMeofGwMap::blocklist_gw(
diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h
index 5f657733012..85fd62b3a07 100755
--- a/src/mon/NVMeofGwMap.h
+++ b/src/mon/NVMeofGwMap.h
@@ -144,7 +144,7 @@ public:
DECODE_FINISH(bl);
}
- void get_health_checks(health_check_map_t *checks) const;
+ void get_health_checks(health_check_map_t *checks);
};
#include "NVMeofGwSerialize.h"
diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h
index 7fae8b766a5..d7f5fd89cde 100644
--- a/src/mon/NVMeofGwMon.h
+++ b/src/mon/NVMeofGwMon.h
@@ -82,6 +82,8 @@ public:
void check_subs(bool type);
void check_sub(Subscription *sub);
+ std::map<NvmeGroupKey, std::map<NvmeGwId, utime_t>> gws_deleting_time;
+
private:
void synchronize_last_beacon();
void process_gw_down(const NvmeGwId &gw_id,
diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc
index 7da9a67be62..65627b5f818 100644
--- a/src/os/DBObjectMap.cc
+++ b/src/os/DBObjectMap.cc
@@ -519,6 +519,11 @@ bufferlist DBObjectMap::DBObjectMapIteratorImpl::value()
return cur_iter->value();
}
+std::string_view DBObjectMap::DBObjectMapIteratorImpl::value_as_sv()
+{
+ return cur_iter->value_as_sv();
+}
+
int DBObjectMap::DBObjectMapIteratorImpl::status()
{
return r;
diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h
index 444f21eb815..1e1452010e7 100644
--- a/src/os/DBObjectMap.h
+++ b/src/os/DBObjectMap.h
@@ -393,6 +393,7 @@ private:
int next() override { ceph_abort(); return 0; }
std::string key() override { ceph_abort(); return ""; }
ceph::buffer::list value() override { ceph_abort(); return ceph::buffer::list(); }
+ std::string_view value_as_sv() override { ceph_abort(); return std::string_view(); }
int status() override { return 0; }
};
@@ -431,6 +432,7 @@ private:
int next() override;
std::string key() override;
ceph::buffer::list value() override;
+ std::string_view value_as_sv() override;
int status() override;
bool on_parent() {
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index 521435b6c31..df3ae920a2f 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -29,6 +29,7 @@
#include <errno.h>
#include <sys/stat.h>
+#include <functional>
#include <map>
#include <memory>
#include <vector>
@@ -735,15 +736,6 @@ public:
std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
) = 0;
-#ifdef WITH_SEASTAR
- virtual int omap_get_values(
- CollectionHandle &c, ///< [in] Collection containing oid
- const ghobject_t &oid, ///< [in] Object containing omap
- const std::optional<std::string> &start_after, ///< [in] Keys to get
- std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
- ) = 0;
-#endif
-
/// Filters keys into out which are defined on oid
virtual int omap_check_keys(
CollectionHandle &c, ///< [in] Collection containing oid
@@ -766,6 +758,48 @@ public:
const ghobject_t &oid ///< [in] object
) = 0;
+ struct omap_iter_seek_t {
+ std::string seek_position;
+ enum {
+ // start with provided key (seek_position), if it exists
+ LOWER_BOUND,
+ // skip provided key (seek_position) even if it exists
+ UPPER_BOUND
+ } seek_type = LOWER_BOUND;
+ static omap_iter_seek_t min_lower_bound() { return {}; }
+ };
+ enum class omap_iter_ret_t {
+ STOP,
+ NEXT
+ };
+ /**
+   * Iterate over an object's omap with a user-provided callable
+   *
+   * Warning! The callable is executed while a lock blocking other
+   * BlueStore operations on c is held. Do not call BlueStore methods
+   * on c while iterating. (Filling in a transaction is no problem.)
+   *
+   * @param c collection
+   * @param oid object
+   * @param start_from where the iterator should point to at
+   *                   the beginning
+   * @param visitor callable that takes an omap key and its corresponding
+   *                value as string_views and controls iteration through
+   *                its return value. It is invoked for every omap entry
+   *                of the object, from `start_from` until the end of the
+   *                object's omap or until iteration is stopped by `STOP`.
+   *                If there is no such entry, `visitor` is called 0 times.
+ * @return error code, zero on success
+ */
+ virtual int omap_iterate(
+ CollectionHandle &c,
+ const ghobject_t &oid,
+ omap_iter_seek_t start_from,
+ std::function<omap_iter_ret_t(std::string_view,
+ std::string_view)> visitor
+ ) = 0;
+
virtual int flush_journal() { return -EOPNOTSUPP; }
virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; }
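
A hedged usage sketch of the visitor API declared above, resuming strictly after a cursor key and collecting at most 100 entries; `store`, `ch`, and `oid` are assumed to be a valid ObjectStore pointer, collection handle, and object id:

    std::map<std::string, std::string> page;
    ObjectStore::omap_iter_seek_t seek;
    seek.seek_position = "cursor";  // placeholder resume key
    seek.seek_type = ObjectStore::omap_iter_seek_t::UPPER_BOUND;  // skip "cursor" itself

    int r = store->omap_iterate(ch, oid, seek,
      [&page](std::string_view key, std::string_view value) {
        // the views die when the iterator advances, so copy them out
        page.emplace(std::string{key}, std::string{value});
        return page.size() < 100 ? ObjectStore::omap_iter_ret_t::NEXT
                                 : ObjectStore::omap_iter_ret_t::STOP;
      });
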
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index a024a0c2105..25e6c4fe596 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -4830,7 +4830,7 @@ void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
out->append(old.c_str() + out->length(), old.size() - out->length());
}
-void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
+size_t BlueStore::Onode::calc_userkey_offset_in_omap_key() const
{
size_t pos = sizeof(uint64_t) + 1;
if (!onode.is_pgmeta_omap()) {
@@ -4840,9 +4840,15 @@ void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
pos += sizeof(uint64_t);
}
}
- *user_key = key.substr(pos);
+ return pos;
}
+void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
+{
+ *user_key = key.substr(calc_userkey_offset_in_omap_key());
+}
+
+
void BlueStore::Onode::finish_write(TransContext* txc, uint32_t offset, uint32_t length)
{
while (true) {
@@ -5519,7 +5525,13 @@ BlueStore::OmapIteratorImpl::OmapIteratorImpl(
if (o->onode.has_omap()) {
o->get_omap_key(string(), &head);
o->get_omap_tail(&tail);
+ auto start1 = mono_clock::now();
it->lower_bound(head);
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_seek_to_first_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
}
}
BlueStore::OmapIteratorImpl::~OmapIteratorImpl()
@@ -5654,6 +5666,13 @@ bufferlist BlueStore::OmapIteratorImpl::value()
return it->value();
}
+std::string_view BlueStore::OmapIteratorImpl::value_as_sv()
+{
+ std::shared_lock l(c->lock);
+ ceph_assert(it->valid());
+ return it->value_as_sv();
+}
+
// =====================================
@@ -13601,52 +13620,6 @@ int BlueStore::omap_get_values(
return r;
}
-#ifdef WITH_SEASTAR
-int BlueStore::omap_get_values(
- CollectionHandle &c_, ///< [in] Collection containing oid
- const ghobject_t &oid, ///< [in] Object containing omap
- const std::optional<string> &start_after, ///< [in] Keys to get
- map<string, bufferlist> *output ///< [out] Returned keys and values
- )
-{
- Collection *c = static_cast<Collection *>(c_.get());
- dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
- if (!c->exists)
- return -ENOENT;
- std::shared_lock l(c->lock);
- int r = 0;
- OnodeRef o = c->get_onode(oid, false);
- if (!o || !o->exists) {
- r = -ENOENT;
- goto out;
- }
- if (!o->onode.has_omap()) {
- goto out;
- }
- o->flush();
- {
- ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
- if (!iter) {
- r = -ENOENT;
- goto out;
- }
- if (start_after) {
- iter->upper_bound(*start_after);
- } else {
- iter->seek_to_first();
- }
- for (; iter->valid(); iter->next()) {
- output->insert(make_pair(iter->key(), iter->value()));
- }
- }
-
-out:
- dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
- << dendl;
- return r;
-}
-#endif
-
int BlueStore::omap_check_keys(
CollectionHandle &c_, ///< [in] Collection containing oid
const ghobject_t &oid, ///< [in] Object containing omap
@@ -13724,6 +13697,94 @@ ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(logger,c, o, it));
}
+int BlueStore::omap_iterate(
+ CollectionHandle &c_, ///< [in] collection
+ const ghobject_t &oid, ///< [in] object
+ ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning
+ std::function<omap_iter_ret_t(std::string_view, std::string_view)> f
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+ if (!c->exists) {
+ return -ENOENT;
+ }
+ std::shared_lock l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
+ return -ENOENT;
+ }
+ o->flush();
+  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() << dendl;
+ if (!o->onode.has_omap()) {
+ // nothing to do
+ return 0;
+ }
+
+ KeyValueDB::Iterator it;
+ {
+ auto bounds = KeyValueDB::IteratorBounds();
+ std::string lower_bound, upper_bound;
+ o->get_omap_key(string(), &lower_bound);
+ o->get_omap_tail(&upper_bound);
+ bounds.lower_bound = std::move(lower_bound);
+ bounds.upper_bound = std::move(upper_bound);
+ it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds));
+ }
+
+ // seek the iterator
+ {
+ std::string key;
+ o->get_omap_key(start_from.seek_position, &key);
+ auto start = ceph::mono_clock::now();
+ if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) {
+ it->lower_bound(key);
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_lower_bound_lat,
+ ceph::mono_clock::now() - start,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+ } else {
+ it->upper_bound(key);
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_upper_bound_lat,
+ ceph::mono_clock::now() - start,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+ }
+ }
+
+ // iterate!
+ std::string tail;
+ o->get_omap_tail(&tail);
+ const std::string_view::size_type userkey_offset_in_dbkey =
+ o->calc_userkey_offset_in_omap_key();
+ ceph::timespan next_lat_acc{0};
+ while (it->valid()) {
+ const auto& db_key = it->raw_key_as_sv().second;
+ if (db_key >= tail) {
+ break;
+ }
+ std::string_view user_key = db_key.substr(userkey_offset_in_dbkey);
+ omap_iter_ret_t ret = f(user_key, it->value_as_sv());
+ if (ret == omap_iter_ret_t::STOP) {
+ break;
+ } else if (ret == omap_iter_ret_t::NEXT) {
+      // name the guard so its destructor runs after it->next(),
+      // accumulating the duration of the advance into next_lat_acc;
+      // an unnamed temporary would be destroyed before next() runs
+      ceph::time_guard<ceph::mono_clock> measure_next{next_lat_acc};
+      it->next();
+ } else {
+ ceph_abort();
+ }
+ }
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_next_lat,
+ next_lat_acc,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+ return 0;
+}
+
// -----------------
// write helpers
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 99f8d057cf0..5549f97ffea 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -1457,6 +1457,7 @@ public:
}
void rewrite_omap_key(const std::string& old, std::string *out);
+ size_t calc_userkey_offset_in_omap_key() const;
void decode_omap_key(const std::string& key, std::string *user_key);
void finish_write(TransContext* txc, uint32_t offset, uint32_t length);
@@ -1753,6 +1754,7 @@ public:
int next() override;
std::string key() override;
ceph::buffer::list value() override;
+ std::string_view value_as_sv() override;
std::string tail_key() override {
return tail;
}
@@ -3416,15 +3418,6 @@ public:
std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
) override;
-#ifdef WITH_SEASTAR
- int omap_get_values(
- CollectionHandle &c, ///< [in] Collection containing oid
- const ghobject_t &oid, ///< [in] Object containing omap
- const std::optional<std::string> &start_after, ///< [in] Keys to get
- std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
- ) override;
-#endif
-
/// Filters keys into out which are defined on oid
int omap_check_keys(
CollectionHandle &c, ///< [in] Collection containing oid
@@ -3438,6 +3431,13 @@ public:
const ghobject_t &oid ///< [in] object
) override;
+ int omap_iterate(
+ CollectionHandle &c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] object
+ omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning
+ std::function<omap_iter_ret_t(std::string_view, std::string_view)> f
+ ) override;
+
void set_fsid(uuid_d u) override {
fsid = u;
}
diff --git a/src/os/kstore/KStore.cc b/src/os/kstore/KStore.cc
index 7158486ca38..a069d429155 100644
--- a/src/os/kstore/KStore.cc
+++ b/src/os/kstore/KStore.cc
@@ -1651,6 +1651,13 @@ bufferlist KStore::OmapIteratorImpl::value()
return it->value();
}
+std::string_view KStore::OmapIteratorImpl::value_as_sv()
+{
+ std::shared_lock l{c->lock};
+ ceph_assert(it->valid());
+ return it->value_as_sv();
+}
+
int KStore::omap_get(
CollectionHandle& ch, ///< [in] Collection containing oid
const ghobject_t &oid, ///< [in] Object containing omap
@@ -1866,6 +1873,71 @@ ObjectMap::ObjectMapIterator KStore::get_omap_iterator(
return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
}
+int KStore::omap_iterate(
+ CollectionHandle &ch, ///< [in] collection
+ const ghobject_t &oid, ///< [in] object
+ ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning
+ std::function<omap_iter_ret_t(std::string_view, std::string_view)> f)
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ {
+ std::shared_lock l{c->lock};
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+      dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
+ return -ENOENT;
+ }
+ o->flush();
+    dout(10) << __func__ << " header = " << o->onode.omap_head << dendl;
+
+    std::string tail;
+    std::string seek_key;
+    if (!o->onode.omap_head) {
+      return 0; // nothing to do
+    }
+
+    // acquire data dependencies for seek & iterate
+    get_omap_key(o->onode.omap_head, start_from.seek_position, &seek_key);
+    get_omap_tail(o->onode.omap_head, &tail);
+
+    // acquire the iterator
+    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+
+ // seek the iterator
+ {
+ if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) {
+ it->lower_bound(seek_key);
+ } else {
+ it->upper_bound(seek_key);
+ }
+ }
+
+ // iterate!
+ while (it->valid()) {
+ std::string user_key;
+ if (const auto& db_key = it->raw_key().second; db_key >= tail) {
+ break;
+ } else {
+ decode_omap_key(db_key, &user_key);
+ }
+ omap_iter_ret_t ret = f(user_key, it->value_as_sv());
+ if (ret == omap_iter_ret_t::STOP) {
+ break;
+ } else if (ret == omap_iter_ret_t::NEXT) {
+ it->next();
+ } else {
+ ceph_abort();
+ }
+ }
+ }
+ return 0;
+}
+
// -----------------
// write helpers
diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h
index 9a9d413c66a..06115d3cab7 100644
--- a/src/os/kstore/KStore.h
+++ b/src/os/kstore/KStore.h
@@ -180,6 +180,7 @@ public:
int next() override;
std::string key() override;
ceph::buffer::list value() override;
+ std::string_view value_as_sv() override;
int status() override {
return 0;
}
@@ -553,6 +554,13 @@ public:
const ghobject_t &oid ///< [in] object
) override;
+ int omap_iterate(
+ CollectionHandle &c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] object
+ omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning
+ std::function<omap_iter_ret_t(std::string_view, std::string_view)> f
+ ) override;
+
void set_fsid(uuid_d u) override {
fsid = u;
}
diff --git a/src/os/memstore/MemStore.cc b/src/os/memstore/MemStore.cc
index 89cb09361cf..f9d3bf0d8a2 100644
--- a/src/os/memstore/MemStore.cc
+++ b/src/os/memstore/MemStore.cc
@@ -537,30 +537,6 @@ int MemStore::omap_get_values(
return 0;
}
-#ifdef WITH_SEASTAR
-int MemStore::omap_get_values(
- CollectionHandle& ch, ///< [in] Collection containing oid
- const ghobject_t &oid, ///< [in] Object containing omap
- const std::optional<std::string> &start_after, ///< [in] Keys to get
- std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
- )
-{
- dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
- Collection *c = static_cast<Collection*>(ch.get());
- ObjectRef o = c->get_object(oid);
- if (!o)
- return -ENOENT;
- assert(start_after);
- std::lock_guard lock{o->omap_mutex};
- for (auto it = o->omap.upper_bound(*start_after);
- it != std::end(o->omap);
- ++it) {
- out->insert(*it);
- }
- return 0;
-}
-#endif
-
int MemStore::omap_check_keys(
CollectionHandle& ch, ///< [in] Collection containing oid
const ghobject_t &oid, ///< [in] Object containing omap
@@ -622,6 +598,10 @@ public:
std::lock_guard lock{o->omap_mutex};
return it->second;
}
+ std::string_view value_as_sv() override {
+ std::lock_guard lock{o->omap_mutex};
+ return std::string_view{it->second.c_str(), it->second.length()};
+ }
int status() override {
return 0;
}
@@ -639,6 +619,48 @@ ObjectMap::ObjectMapIterator MemStore::get_omap_iterator(
return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o));
}
+int MemStore::omap_iterate(
+ CollectionHandle &ch, ///< [in] collection
+ const ghobject_t &oid, ///< [in] object
+ ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning
+ std::function<omap_iter_ret_t(std::string_view, std::string_view)> f)
+{
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o) {
+ return -ENOENT;
+ }
+
+ {
+ std::lock_guard lock{o->omap_mutex};
+
+    // obtain and seek the iterator
+ decltype(o->omap)::iterator it;
+ {
+ if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) {
+ it = o->omap.lower_bound(start_from.seek_position);
+ } else {
+ it = o->omap.upper_bound(start_from.seek_position);
+ }
+ }
+
+ // iterate!
+ while (it != o->omap.end()) {
+      // c_str() may rebuild the bufferlist into one contiguous buffer
+      // (a memcpy), which is acceptable for memstore
+ omap_iter_ret_t ret =
+ f(it->first, std::string_view{it->second.c_str(), it->second.length()});
+ if (ret == omap_iter_ret_t::STOP) {
+ break;
+ } else if (ret == omap_iter_ret_t::NEXT) {
+ ++it;
+ } else {
+ ceph_abort();
+ }
+ }
+ }
+ return 0;
+}
+
// ---------------
// write operations
diff --git a/src/os/memstore/MemStore.h b/src/os/memstore/MemStore.h
index 2abe552891f..9621773598f 100644
--- a/src/os/memstore/MemStore.h
+++ b/src/os/memstore/MemStore.h
@@ -363,14 +363,6 @@ public:
const std::set<std::string> &keys, ///< [in] Keys to get
std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
) override;
-#ifdef WITH_SEASTAR
- int omap_get_values(
- CollectionHandle &c, ///< [in] Collection containing oid
- const ghobject_t &oid, ///< [in] Object containing omap
- const std::optional<std::string> &start_after, ///< [in] Keys to get
- std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
- ) override;
-#endif
using ObjectStore::omap_check_keys;
/// Filters keys into out which are defined on oid
@@ -387,6 +379,13 @@ public:
const ghobject_t &oid ///< [in] object
) override;
+ int omap_iterate(
+ CollectionHandle &c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] object
+ omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning
+ std::function<omap_iter_ret_t(std::string_view, std::string_view)> f
+ ) override;
+
void set_fsid(uuid_d u) override;
uuid_d get_fsid() override;
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index fa2570aba42..8630b038812 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -945,6 +945,10 @@ void ECBackend::handle_sub_write(
}
trace.event("handle_sub_write");
+ if (cct->_conf->bluestore_debug_inject_read_err &&
+ ec_inject_test_write_error3(op.soid)) {
+ ceph_abort_msg("Error inject - OSD down");
+ }
if (!get_parent()->pgb_is_primary())
get_parent()->update_stats(op.stats);
ObjectStore::Transaction localt;
@@ -1191,6 +1195,15 @@ void ECBackend::handle_sub_write_reply(
i->second->on_all_commit = 0;
i->second->trace.event("ec write all committed");
}
+ if (cct->_conf->bluestore_debug_inject_read_err &&
+ (i->second->pending_commit.size() == 1) &&
+ ec_inject_test_write_error2(i->second->hoid)) {
+ std::string cmd =
+ "{ \"prefix\": \"osd down\", \"ids\": [\"" + std::to_string( get_parent()->whoami() ) + "\"] }";
+ vector<std::string> vcmd{cmd};
+ dout(0) << __func__ << " Error inject - marking OSD down" << dendl;
+ get_parent()->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr);
+ }
rmw_pipeline.check_ops();
}
@@ -1208,6 +1221,19 @@ void ECBackend::handle_sub_read_reply(
return;
}
ReadOp &rop = iter->second;
+ if (cct->_conf->bluestore_debug_inject_read_err) {
+ for (auto i = op.buffers_read.begin();
+ i != op.buffers_read.end();
+ ++i) {
+ if (ec_inject_test_read_error0(ghobject_t(i->first, ghobject_t::NO_GEN, op.from.shard))) {
+ dout(0) << __func__ << " Error inject - EIO error for shard " << op.from.shard << dendl;
+ op.buffers_read.erase(i->first);
+ op.attrs_read.erase(i->first);
+ op.errors[i->first] = -EIO;
+ }
+
+ }
+ }
for (auto i = op.buffers_read.begin();
i != op.buffers_read.end();
++i) {
diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc
index 609ac3141ae..59077547fcb 100644
--- a/src/osd/ECCommon.cc
+++ b/src/osd/ECCommon.cc
@@ -226,8 +226,14 @@ void ECCommon::ReadPipeline::get_all_avail_shards(
++i) {
dout(10) << __func__ << ": checking acting " << *i << dendl;
const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
- if (error_shards.find(*i) != error_shards.end())
+ if (error_shards.contains(*i)) {
continue;
+ }
+ if (cct->_conf->bluestore_debug_inject_read_err &&
+ ec_inject_test_read_error1(ghobject_t(hoid, ghobject_t::NO_GEN, i->shard))) {
+ dout(0) << __func__ << " Error inject - Missing shard " << i->shard << dendl;
+ continue;
+ }
if (!missing.is_missing(hoid)) {
ceph_assert(!have.count(i->shard));
have.insert(i->shard);
@@ -912,6 +918,11 @@ bool ECCommon::RMWPipeline::try_reads_to_commit()
if (*i == get_parent()->whoami_shard()) {
should_write_local = true;
local_write_op.claim(sop);
+ } else if (cct->_conf->bluestore_debug_inject_read_err &&
+ ec_inject_test_write_error1(ghobject_t(op->hoid,
+ ghobject_t::NO_GEN, i->shard))) {
+ dout(0) << " Error inject - Dropping write message to shard " <<
+ i->shard << dendl;
} else {
MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop);
r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard);
@@ -1090,3 +1101,305 @@ ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info(
}
return ref;
}
+
+// Error inject interfaces
+static ceph::recursive_mutex ec_inject_lock =
+ ceph::make_recursive_mutex("ECCommon::ec_inject_lock");
+static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_read_failures0;
+static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_read_failures1;
+static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures0;
+static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures1;
+static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures2;
+static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures3;
+static std::map<ghobject_t,shard_id_t> ec_inject_write_failures0_shard;
+static std::set<osd_reqid_t> ec_inject_write_failures0_reqid;
+
+/**
+ * Configure a read error inject that typically forces additional reads of
+ * shards in an EC pool to recover data using the redundancy. With multiple
+ * errors it is possible to force client reads to fail.
+ *
+ * Type 0 - Simulate a medium error. Fail a read with -EIO to force
+ * additional reads and a decode
+ *
+ * Type 1 - Simulate a missing OSD. Don't even try to read a shard
+ *
+ * @brief Set up a read error inject for an object in an EC pool.
+ * @param o Target object for the error inject.
+ * @param when Error inject starts after this many object store reads.
+ * @param duration Error inject affects this many object store reads.
+ * @param type Type of error inject 0 = EIO, 1 = missing shard.
+ * @return string Result of configuring the error inject.
+ */
+std::string ec_inject_read_error(const ghobject_t& o,
+ const int64_t type,
+ const int64_t when,
+ const int64_t duration) {
+ std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock);
+ ghobject_t os = o;
+ if (os.hobj.oid.name == "*") {
+ os.hobj.set_hash(0);
+ }
+ switch (type) {
+ case 0:
+ ec_inject_read_failures0[os] = std::pair(when, duration);
+ return "ok - read returns EIO";
+ case 1:
+ ec_inject_read_failures1[os] = std::pair(when, duration);
+ return "ok - read pretends shard is missing";
+ default:
+ break;
+ }
+ return "unrecognized error inject type";
+}
+
+/**
+ * Configure a write error inject that either fails an OSD or causes a
+ * client write operation to be rolled back.
+ *
+ * Type 0 - Tests rollback. Drop a write I/O to a shard, then simulate an OSD
+ * down to force rollback to occur, and finally fail the retried write from the
+ * client so the results of the rollback can be inspected.
+ *
+ * Type 1 - Drop a write I/O to a shard. Used on its own this will hang a
+ * write I/O.
+ *
+ * Type 2 - Simulate an OSD down (ceph osd down) to force a new epoch. Usually
+ * used together with type 1 to force a rollback
+ *
+ * Type 3 - Abort when an OSD processes a write I/O to a shard. Typically the
+ * client write will be committed while the OSD is absent, which will result in
+ * recovery or backfill later when the OSD returns.
+ *
+ * @brief Set up a write error inject for an object in an EC pool.
+ * @param o Target object for the error inject.
+ * @param when Error inject starts after this many object store writes.
+ * @param duration Error inject affects this many object store writes.
+ * @param type Type of error inject 0 = rollback, 1 = drop write, 2 = OSD down, 3 = abort.
+ * @return string Result of configuring the error inject.
+ */
+std::string ec_inject_write_error(const ghobject_t& o,
+ const int64_t type,
+ const int64_t when,
+ const int64_t duration) {
+ std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock);
+ std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures;
+ ghobject_t os = o;
+ bool no_shard = true;
+ std::string result;
+ switch (type) {
+ case 0:
+ failures = &ec_inject_write_failures0;
+ result = "ok - drop write, sim OSD down and fail client retry with EINVAL";
+ break;
+ case 1:
+ failures = &ec_inject_write_failures1;
+ no_shard = false;
+ result = "ok - drop write to shard";
+ break;
+ case 2:
+ failures = &ec_inject_write_failures2;
+ result = "ok - inject OSD down";
+ break;
+ case 3:
+ if (duration != 1) {
+ return "duration must be 1";
+ }
+ failures = &ec_inject_write_failures3;
+ result = "ok - write abort OSDs";
+ break;
+ default:
+ return "unrecognized error inject type";
+ }
+ if (no_shard) {
+ os.set_shard(shard_id_t::NO_SHARD);
+ }
+ if (os.hobj.oid.name == "*") {
+ os.hobj.set_hash(0);
+ }
+ (*failures)[os] = std::pair(when, duration);
+ if (type == 0) {
+ ec_inject_write_failures0_shard[os] = o.shard_id;
+ }
+ return result;
+}
+
+/**
+ * @brief Clear a previously configured read error inject.
+ * @param o Target object for the error inject.
+ * @param type Type of error inject 0 = EIO, 1 = missing shard.
+ * @return string Indication of how many errors were cleared.
+ */
+std::string ec_inject_clear_read_error(const ghobject_t& o,
+ const int64_t type) {
+ std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock);
+ std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures;
+ ghobject_t os = o;
+ int64_t remaining = 0;
+ switch (type) {
+ case 0:
+ failures = &ec_inject_read_failures0;
+ break;
+ case 1:
+ failures = &ec_inject_read_failures1;
+ break;
+ default:
+ return "unrecognized error inject type";
+ }
+ if (os.hobj.oid.name == "*") {
+ os.hobj.set_hash(0);
+ }
+ auto it = failures->find(os);
+ if (it != failures->end()) {
+ remaining = it->second.second;
+ failures->erase(it);
+ }
+ if (remaining == 0) {
+ return "no outstanding error injects";
+ } else if (remaining == 1) {
+ return "ok - 1 inject cleared";
+ }
+ return "ok - " + std::to_string(remaining) + " injects cleared";
+}
+
+/**
+ * @brief Clear a previously configured write error inject.
+ * @param o Target object for the error inject.
+ * @param type Type of error inject 0 = rollback, 1 = drop write, 2 = OSD down, 3 = abort.
+ * @return string Indication of how many errors were cleared.
+ */
+std::string ec_inject_clear_write_error(const ghobject_t& o,
+ const int64_t type) {
+ std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock);
+ std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures;
+ ghobject_t os = o;
+ bool no_shard = true;
+ int64_t remaining = 0;
+ switch (type) {
+ case 0:
+ failures = &ec_inject_write_failures0;
+ break;
+ case 1:
+ failures = &ec_inject_write_failures1;
+ no_shard = false;
+ break;
+ case 2:
+ failures = &ec_inject_write_failures2;
+ break;
+ case 3:
+ failures = &ec_inject_write_failures3;
+ break;
+ default:
+ return "unrecognized error inject type";
+ }
+ if (no_shard) {
+ os.set_shard(shard_id_t::NO_SHARD);
+ }
+ if (os.hobj.oid.name == "*") {
+ os.hobj.set_hash(0);
+ }
+ auto it = failures->find(os);
+ if (it != failures->end()) {
+ remaining = it->second.second;
+ failures->erase(it);
+ if (type == 0) {
+ ec_inject_write_failures0_shard.erase(os);
+ }
+ }
+ if (remaining == 0) {
+ return "no outstanding error injects";
+ } else if (remaining == 1) {
+ return "ok - 1 inject cleared";
+ }
+ return "ok - " + std::to_string(remaining) + " injects cleared";
+}
+
+static bool ec_inject_test_error(const ghobject_t& o,
+ std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures)
+{
+ std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock);
+ auto it = failures->find(o);
+ if (it == failures->end()) {
+ ghobject_t os = o;
+ os.hobj.oid.name = "*";
+ os.hobj.set_hash(0);
+ it = failures->find(os);
+ }
+ if (it != failures->end()) {
+ auto && [when,duration] = it->second;
+ if (when > 0) {
+ when--;
+ return false;
+ }
+ if (--duration <= 0) {
+ failures->erase(it);
+ }
+ return true;
+ }
+ return false;
+}
+
+bool ec_inject_test_read_error0(const ghobject_t& o)
+{
+ return ec_inject_test_error(o, &ec_inject_read_failures0);
+}
+
+bool ec_inject_test_read_error1(const ghobject_t& o)
+{
+ return ec_inject_test_error(o, &ec_inject_read_failures1);
+}
+
+bool ec_inject_test_write_error0(const hobject_t& o,
+ const osd_reqid_t& reqid) {
+ std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock);
+ ghobject_t os = ghobject_t(o, ghobject_t::NO_GEN, shard_id_t::NO_SHARD);
+ if (ec_inject_write_failures0_reqid.count(reqid)) {
+ // Matched reqid of retried write - flag for failure
+ ec_inject_write_failures0_reqid.erase(reqid);
+ return true;
+ }
+ auto it = ec_inject_write_failures0.find(os);
+ if (it == ec_inject_write_failures0.end()) {
+ os.hobj.oid.name = "*";
+ os.hobj.set_hash(0);
+ it = ec_inject_write_failures0.find(os);
+ }
+ if (it != ec_inject_write_failures0.end()) {
+ auto && [when, duration] = it->second;
+ auto shard = ec_inject_write_failures0_shard.find(os)->second;
+ if (when > 0) {
+ when--;
+ } else {
+ if (--duration <= 0) {
+ ec_inject_write_failures0.erase(it);
+ ec_inject_write_failures0_shard.erase(os);
+ }
+ // Error inject triggered - save reqid
+ ec_inject_write_failures0_reqid.insert(reqid);
+ // Set up error inject to drop message to primary
+ ec_inject_write_error(ghobject_t(o, ghobject_t::NO_GEN, shard), 1, 0, 1);
+ }
+ }
+ return false;
+}
+
+bool ec_inject_test_write_error1(const ghobject_t& o) {
+ bool rc = ec_inject_test_error(o, &ec_inject_write_failures1);
+ if (rc) {
+ // Set up error inject to generate OSD down
+ ec_inject_write_error(o, 2, 0, 1);
+ }
+ return rc;
+}
+
+bool ec_inject_test_write_error2(const hobject_t& o) {
+ return ec_inject_test_error(
+ ghobject_t(o, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
+ &ec_inject_write_failures2);
+}
+
+bool ec_inject_test_write_error3(const hobject_t& o) {
+ return ec_inject_test_error(
+ ghobject_t(o, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
+ &ec_inject_write_failures3);
+}
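
Aside: the when/duration pair in each inject behaves as "skip the first 'when' hits, then fire for the next 'duration' hits, after which the entry is erased". A standalone illustration of that counter logic only (not Ceph code):

  #include <cstdint>
  #include <iostream>

  struct inject_counters {
    int64_t when;      // hits to skip before the inject arms
    int64_t duration;  // hits that fail once armed
    bool hit() {       // returns true when the inject fires
      if (when > 0) {
        --when;
        return false;          // still arming
      }
      return duration-- > 0;   // fires exactly 'duration' times
    }
  };

  int main() {
    inject_counters c{2, 3};   // skip 2 hits, fail the next 3
    for (int i = 0; i < 7; ++i) {
      std::cout << "hit " << i << (c.hit() ? ": inject\n" : ": pass\n");
    }
    return 0;
  }
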
diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h
index 7ff9cae7646..de4c11ad50f 100644
--- a/src/osd/ECCommon.h
+++ b/src/osd/ECCommon.h
@@ -493,6 +493,7 @@ struct ECCommon {
); ///< @return error code, 0 on success
void schedule_recovery_work();
+
};
/**
@@ -843,3 +844,15 @@ void ECCommon::ReadPipeline::filter_read_op(
on_schedule_recovery(op);
}
}
+
+// Error inject interfaces
+std::string ec_inject_read_error(const ghobject_t& o, const int64_t type, const int64_t when, const int64_t duration);
+std::string ec_inject_write_error(const ghobject_t& o, const int64_t type, const int64_t when, const int64_t duration);
+std::string ec_inject_clear_read_error(const ghobject_t& o, const int64_t type);
+std::string ec_inject_clear_write_error(const ghobject_t& o, const int64_t type);
+bool ec_inject_test_read_error0(const ghobject_t& o);
+bool ec_inject_test_read_error1(const ghobject_t& o);
+bool ec_inject_test_write_error0(const hobject_t& o,const osd_reqid_t& reqid);
+bool ec_inject_test_write_error1(const ghobject_t& o);
+bool ec_inject_test_write_error2(const hobject_t& o);
+bool ec_inject_test_write_error3(const hobject_t& o);
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 5223eb283e9..9c9e540cf61 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -37,6 +37,7 @@
#include "osd/PG.h"
#include "osd/scrubber/scrub_machine.h"
#include "osd/scrubber/pg_scrubber.h"
+#include "osd/ECCommon.h"
#include "include/types.h"
#include "include/compat.h"
@@ -4348,6 +4349,46 @@ void OSD::final_init()
"inject metadata error to an object");
ceph_assert(r == 0);
r = admin_socket->register_command(
+ "injectecreaderr " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=true,range=0|255 " \
+ "name=type,type=CephInt,req=false " \
+ "name=when,type=CephInt,req=false " \
+ "name=duration,type=CephInt,req=false",
+ test_ops_hook,
+ "inject error for read of object in an EC pool");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "injectecclearreaderr " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=true,range=0|255 " \
+ "name=type,type=CephInt,req=false",
+ test_ops_hook,
+ "clear read error injects for object in an EC pool");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "injectecwriteerr " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=true,range=0|255 " \
+ "name=type,type=CephInt,req=false " \
+ "name=when,type=CephInt,req=false " \
+ "name=duration,type=CephInt,req=false",
+ test_ops_hook,
+ "inject error for write of object in an EC pool");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "injectecclearwriteerr " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=true,range=0|255 " \
+ "name=type,type=CephInt,req=false",
+ test_ops_hook,
+ "clear write error inject for object in an EC pool");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
"set_recovery_delay " \
"name=utime,type=CephInt,req=false",
test_ops_hook,
@@ -6487,8 +6528,10 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
//directly request the osd make a change.
if (command == "setomapval" || command == "rmomapkey" ||
command == "setomapheader" || command == "getomap" ||
- command == "truncobj" || command == "injectmdataerr" ||
- command == "injectdataerr"
+ command == "truncobj" ||
+ command == "injectmdataerr" || command == "injectdataerr" ||
+ command == "injectecreaderr" || command == "injectecclearreaderr" ||
+ command == "injectecwriteerr" || command == "injectecclearwriteerr"
) {
pg_t rawpg;
int64_t pool;
@@ -6527,8 +6570,21 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
if (curmap->pg_is_ec(rawpg)) {
- if ((command != "injectdataerr") && (command != "injectmdataerr")) {
- ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
+ if ((command != "injectdataerr") &&
+ (command != "injectmdataerr") &&
+ (command != "injectecreaderr") &&
+ (command != "injectecclearreaderr") &&
+ (command != "injectecwriteerr") &&
+ (command != "injectecclearwriteerr")) {
+ ss << "Must not call on ec pool";
+ return;
+ }
+ } else {
+      if ((command == "injectecreaderr") ||
+          (command == "injectecclearreaderr") ||
+          (command == "injectecwriteerr") ||
+          (command == "injectecclearwriteerr")) {
+ ss << "Only supported on ec pool";
return;
}
}
@@ -6607,6 +6663,38 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
} else if (command == "injectmdataerr") {
store->inject_mdata_error(gobj);
ss << "ok";
+ } else if (command == "injectecreaderr") {
+ if (service->cct->_conf->bluestore_debug_inject_read_err) {
+ int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0);
+ int64_t when = cmd_getval_or<int64_t>(cmdmap, "when", 0);
+ int64_t duration = cmd_getval_or<int64_t>(cmdmap, "duration", 1);
+ ss << ec_inject_read_error(gobj, type, when, duration);
+ } else {
+ ss << "bluestore_debug_inject_read_err not enabled";
+ }
+ } else if (command == "injectecclearreaderr") {
+ if (service->cct->_conf->bluestore_debug_inject_read_err) {
+ int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0);
+ ss << ec_inject_clear_read_error(gobj, type);
+ } else {
+ ss << "bluestore_debug_inject_read_err not enabled";
+ }
+ } else if (command == "injectecwriteerr") {
+ if (service->cct->_conf->bluestore_debug_inject_read_err) {
+ int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0);
+ int64_t when = cmd_getval_or<int64_t>(cmdmap, "when", 0);
+ int64_t duration = cmd_getval_or<int64_t>(cmdmap, "duration", 1);
+ ss << ec_inject_write_error(gobj, type, when, duration);
+ } else {
+ ss << "bluestore_debug_inject_read_err not enabled";
+ }
+ } else if (command == "injectecclearwriteerr") {
+ if (service->cct->_conf->bluestore_debug_inject_read_err) {
+ int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0);
+ ss << ec_inject_clear_write_error(gobj, type);
+ } else {
+ ss << "bluestore_debug_inject_read_err not enabled";
+ }
}
return;
}
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index b87484c1a9d..9b3593d54e5 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -1642,12 +1642,10 @@ void OSDMap::get_out_of_subnet_osd_counts(CephContext *cct,
for (int i = 0; i < max_osd; i++) {
if (exists(i) && is_up(i)) {
if (const auto& addrs = get_addrs(i).v; addrs.size() >= 2) {
- auto v1_addr = addrs[0].ip_only_to_str();
- if (!is_addr_in_subnet(cct, public_network, v1_addr)) {
+ if (!is_addr_in_subnet(cct, public_network, addrs[0])) {
unreachable->emplace(i);
}
- auto v2_addr = addrs[1].ip_only_to_str();
- if (!is_addr_in_subnet(cct, public_network, v2_addr)) {
+ if (!is_addr_in_subnet(cct, public_network, addrs[1])) {
unreachable->emplace(i);
}
}
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index b87aa1da677..f5eb9ea951e 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -290,6 +290,10 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
MessageRef, Connection *con) = 0;
virtual void send_message_osd_cluster(
Message *m, const ConnectionRef& con) = 0;
+ virtual void start_mon_command(
+ const std::vector<std::string>& cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs,
+ Context *onfinish) = 0;
virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0;
virtual entity_name_t get_cluster_msgr_name() = 0;
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 44f8e85b5ef..3324ba9dc91 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -2286,6 +2286,16 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
}
}
+ if (cct->_conf->bluestore_debug_inject_read_err &&
+ op->may_write() &&
+ pool.info.is_erasure() &&
+ ec_inject_test_write_error0(m->get_hobj(), m->get_reqid())) {
+ // Fail retried write with error
+ dout(0) << __func__ << " Error inject - Fail retried write with EINVAL" << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+
ObjectContextRef obc;
bool can_create = op->may_write();
hobject_t missing_oid;
@@ -5798,10 +5808,19 @@ int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
{
- for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
- char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
- if (osd_op.indata[idx] != read_byte) {
- return (-MAX_ERRNO - idx);
+ auto input_iter = osd_op.indata.begin();
+ auto read_iter = read_bl.begin();
+ uint64_t idx = 0;
+
+ while (input_iter != osd_op.indata.end()) {
+ char read_byte = (read_iter != read_bl.end() ? *read_iter : 0);
+ if (*input_iter != read_byte) {
+ return (-MAX_ERRNO - idx);
+ }
+ ++idx;
+ ++input_iter;
+ if (read_iter != read_bl.end()) {
+ ++read_iter;
}
}
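
Aside: the rewrite above walks two bufferlist iterators in lockstep instead of indexing with operator[], presumably because random access has to locate the containing buffer segment on every call, which made the old loop quadratic on fragmented lists. The return convention is unchanged; a hedged helper for decoding it, assuming MAX_ERRNO comes from the existing Ceph headers:

  #include <cstdint>

  // A mismatch at byte offset idx is reported as (-MAX_ERRNO - idx),
  // so any rc <= -MAX_ERRNO encodes the first differing offset.
  inline uint64_t extent_cmp_mismatch_offset(int rc)
  {
    // callers must have checked rc <= -MAX_ERRNO beforehand
    return static_cast<uint64_t>(-(rc + MAX_ERRNO));
  }
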
@@ -7767,27 +7786,34 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
bool truncated = false;
bufferlist bl;
if (oi.is_omap()) {
- ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
- ch, ghobject_t(soid)
- );
- if (!iter) {
- result = -ENOENT;
- goto fail;
- }
- iter->upper_bound(start_after);
- if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
- for (num = 0;
- iter->valid() &&
- iter->key().substr(0, filter_prefix.size()) == filter_prefix;
- ++num, iter->next()) {
- dout(20) << "Found key " << iter->key() << dendl;
- if (num >= max_return ||
- bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
- truncated = true;
- break;
- }
- encode(iter->key(), bl);
- encode(iter->value(), bl);
+ using omap_iter_seek_t = ObjectStore::omap_iter_seek_t;
+ result = osd->store->omap_iterate(
+ ch, ghobject_t(soid),
+      // seek as far forward as possible in a single call for performance;
+      // a seek is logarithmic, so one seek(n) beats seek(n/2) + seek(n/2).
+ ObjectStore::omap_iter_seek_t{
+ .seek_position = std::max(start_after, filter_prefix),
+ .seek_type = filter_prefix > start_after ? omap_iter_seek_t::LOWER_BOUND
+ : omap_iter_seek_t::UPPER_BOUND
+ },
+ [&bl, &truncated, &filter_prefix, &num, max_return,
+ max_bytes=cct->_conf->osd_max_omap_bytes_per_request]
+ (std::string_view key, std::string_view value) mutable {
+ if (key.substr(0, filter_prefix.size()) != filter_prefix) {
+ return ObjectStore::omap_iter_ret_t::STOP;
+ }
+ if (num >= max_return || bl.length() >= max_bytes) {
+ truncated = true;
+ return ObjectStore::omap_iter_ret_t::STOP;
+ }
+ encode(key, bl);
+ encode(value, bl);
+ ++num;
+ return ObjectStore::omap_iter_ret_t::NEXT;
+ });
+ if (result < 0) {
+ goto fail;
}
} // else return empty out_set
encode(num, osd_op.outdata);
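
Aside: the single seek above folds the old two-step positioning (upper_bound(start_after), then lower_bound(filter_prefix) when the prefix sorts higher) into one call. The same choice, extracted as a hedged standalone helper with hypothetical names:

  #include "os/ObjectStore.h"
  #include <string>

  // Pick the one seek that reproduces the old iterator positioning.
  ObjectStore::omap_iter_seek_t choose_seek(const std::string& start_after,
                                            const std::string& filter_prefix)
  {
    using seek_t = ObjectStore::omap_iter_seek_t;
    if (filter_prefix > start_after) {
      // land on the first key of the prefix range, inclusive
      return seek_t{.seek_position = filter_prefix,
                    .seek_type = seek_t::LOWER_BOUND};
    }
    // resume strictly after the last key the client already saw
    return seek_t{.seek_position = start_after,
                  .seek_type = seek_t::UPPER_BOUND};
  }
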
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index f66b5c6e16a..bf55d539821 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -622,6 +622,12 @@ public:
Message *m, const ConnectionRef& con) override {
osd->send_message_osd_cluster(m, con);
}
+ void start_mon_command(
+ const std::vector<std::string>& cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs,
+ Context *onfinish) override {
+ osd->monc->start_mon_command(cmd, inbl, outbl, outs, onfinish);
+ }
ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) override;
entity_name_t get_cluster_msgr_name() override {
return osd->get_cluster_msgr_name();
@@ -1993,6 +1999,7 @@ public:
private:
DynamicPerfStats m_dynamic_perf_stats;
+
};
inline ostream& operator<<(ostream& out, const PrimaryLogPG::RepGather& repop)
@@ -2021,5 +2028,4 @@ inline ostream& operator<<(ostream& out,
void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop);
void intrusive_ptr_release(PrimaryLogPG::RepGather *repop);
-
#endif
diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py
index f1c56d75378..550604fc55b 100644
--- a/src/pybind/mgr/cephadm/inventory.py
+++ b/src/pybind/mgr/cephadm/inventory.py
@@ -2036,8 +2036,8 @@ class CertKeyStore():
var = service_name if entity in self.service_name_cert else host
j = {}
self.known_certs[entity][var] = cert_obj
- for service_name in self.known_certs[entity].keys():
- j[var] = Cert.to_json(self.known_certs[entity][var])
+ for cert_key in self.known_certs[entity]:
+ j[cert_key] = Cert.to_json(self.known_certs[entity][cert_key])
else:
self.known_certs[entity] = cert_obj
j = Cert.to_json(cert_obj)
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index bf14f8d1715..6690153d435 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -2460,7 +2460,7 @@ Then run the following:
@handle_orch_error
def service_action(self, action: str, service_name: str) -> List[str]:
- if service_name not in self.spec_store.all_specs.keys():
+ if service_name not in self.spec_store.all_specs.keys() and service_name != 'osd':
raise OrchestratorError(f'Invalid service name "{service_name}".'
+ ' View currently running services using "ceph orch ls"')
dds: List[DaemonDescription] = self.cache.get_daemons_by_service(service_name)
@@ -3925,6 +3925,50 @@ Then run the following:
return self.to_remove_osds.all_osds()
@handle_orch_error
+ def set_osd_spec(self, service_name: str, osd_ids: List[str]) -> str:
+ """
+        Update the unit.meta file for the given OSDs with the service name
+ """
+ if service_name not in self.spec_store:
+ raise OrchestratorError(f"Cannot find service '{service_name}' in the inventory. "
+ "Please try again after applying an OSD service that matches "
+ "the service name to which you want to attach OSDs.")
+
+ daemons: List[orchestrator.DaemonDescription] = self.cache.get_daemons_by_type('osd')
+ update_osd = defaultdict(list)
+ for daemon in daemons:
+ if daemon.daemon_id in osd_ids and daemon.hostname:
+ update_osd[daemon.hostname].append(daemon.daemon_id)
+
+ if not update_osd:
+ raise OrchestratorError(f"Unable to find OSDs: {osd_ids}")
+
+ failed_osds = []
+ success_osds = []
+ for host in update_osd:
+ osds = ",".join(update_osd[host])
+            # run the cephadm command for all of this host's OSDs at once;
+            # if it fails, continue with the other hosts
+ try:
+ with self.async_timeout_handler(host):
+ outs, errs, _code = self.wait_async(
+ CephadmServe(self)._run_cephadm(host,
+ cephadmNoImage,
+ 'update-osd-service',
+ ['--service-name', service_name, '--osd-ids', osds]))
+ if _code:
+                    self.log.error(f"Failed to update service for OSD(s) {osds}. Cephadm error: {errs}")
+ failed_osds.extend(update_osd[host])
+ else:
+ success_osds.extend(update_osd[host])
+ except Exception:
+ self.log.exception(f"Failed to set service name for {osds}")
+ failed_osds.extend(update_osd[host])
+ self.cache.invalidate_host_daemons(host)
+ self._kick_serve_loop()
+        return f"Updated service for OSD(s) {','.join(success_osds)}" + (f" and failed for OSD(s) {','.join(failed_osds)}" if failed_osds else "")
+
+ @handle_orch_error
@host_exists()
def drain_host(self, hostname: str, force: bool = False, keep_conf_keyring: bool = False, zap_osd_devices: bool = False) -> str:
"""
diff --git a/src/pybind/mgr/cephadm/schedule.py b/src/pybind/mgr/cephadm/schedule.py
index 98d2fe99897..04d3712c50a 100644
--- a/src/pybind/mgr/cephadm/schedule.py
+++ b/src/pybind/mgr/cephadm/schedule.py
@@ -385,6 +385,8 @@ class HostAssignment(object):
def find_ip_on_host(self, hostname: str, subnets: List[str]) -> Optional[str]:
for subnet in subnets:
+            # normalize the subnet to its canonical network form
+ subnet = str(ipaddress.ip_network(subnet))
ips: List[str] = []
# following is to allow loopback interfaces for both ipv4 and ipv6. Since we
# only have the subnet (and no IP) we assume default loopback IP address.
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py
index 04f5af28a9b..4f83d7bb0fb 100644
--- a/src/pybind/mgr/cephadm/services/cephadmservice.py
+++ b/src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -1157,6 +1157,14 @@ class RgwService(CephService):
'value': str(spec.rgw_bucket_counters_cache_size),
})
+ if getattr(spec, 'disable_multisite_sync_traffic', None) is not None:
+ ret, out, err = self.mgr.check_mon_command({
+ 'prefix': 'config set',
+ 'who': daemon_name,
+ 'name': 'rgw_run_sync_thread',
+ 'value': 'false' if spec.disable_multisite_sync_traffic else 'true',
+ })
+
daemon_spec.keyring = keyring
daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py
index 1b9cf618570..9c5b5a112f3 100644
--- a/src/pybind/mgr/cephadm/services/monitoring.py
+++ b/src/pybind/mgr/cephadm/services/monitoring.py
@@ -3,6 +3,7 @@ import logging
import os
import socket
from typing import List, Any, Tuple, Dict, Optional, cast
+import ipaddress
from mgr_module import HandleCommandResult
@@ -57,6 +58,8 @@ class GrafanaService(CephadmService):
if ip_to_bind_to:
daemon_spec.port_ips = {str(grafana_port): ip_to_bind_to}
grafana_ip = ip_to_bind_to
+ if ipaddress.ip_network(grafana_ip).version == 6:
+ grafana_ip = f"[{grafana_ip}]"
domain = self.mgr.get_fqdn(daemon_spec.host)
mgmt_gw_ips = []
@@ -354,6 +357,13 @@ class AlertmanagerService(CephadmService):
addr = self.mgr.get_fqdn(dd.hostname)
peers.append(build_url(host=addr, port=port).lstrip('/'))
+ ip_to_bind_to = ''
+ if spec.only_bind_port_on_networks and spec.networks:
+ assert daemon_spec.host is not None
+ ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec) or ''
+ if ip_to_bind_to:
+ daemon_spec.port_ips = {str(port): ip_to_bind_to}
+
deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}')
if security_enabled:
alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials()
@@ -376,7 +386,8 @@ class AlertmanagerService(CephadmService):
},
'peers': peers,
'web_config': '/etc/alertmanager/web.yml',
- 'use_url_prefix': mgmt_gw_enabled
+ 'use_url_prefix': mgmt_gw_enabled,
+ 'ip_to_bind_to': ip_to_bind_to
}, sorted(deps)
else:
return {
@@ -384,7 +395,8 @@ class AlertmanagerService(CephadmService):
"alertmanager.yml": yml
},
"peers": peers,
- 'use_url_prefix': mgmt_gw_enabled
+ 'use_url_prefix': mgmt_gw_enabled,
+ 'ip_to_bind_to': ip_to_bind_to
}, sorted(deps)
def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
diff --git a/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 b/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2
index de993cb6ce3..b6955caf616 100644
--- a/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2
+++ b/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2
@@ -8,6 +8,8 @@ global:
tls_config:
{% if security_enabled %}
ca_file: root_cert.pem
+ cert_file: alertmanager.crt
+ key_file: alertmanager.key
{% else %}
insecure_skip_verify: true
{% endif %}
diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2
index b9773ceeeb3..14af0fd48ca 100644
--- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2
@@ -9,6 +9,7 @@ events {
http {
#access_log /dev/stdout;
+ error_log /dev/stderr info;
client_header_buffer_size 32K;
large_client_header_buffers 4 32k;
proxy_busy_buffers_size 512k;
diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
index ecfd899af71..961da145dac 100644
--- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
+++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
@@ -28,6 +28,8 @@ alerting:
password: {{ service_discovery_password }}
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
{% else %}
- scheme: http
http_sd_configs:
@@ -56,6 +58,8 @@ scrape_configs:
password: {{ service_discovery_password }}
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
{% else %}
honor_labels: true
http_sd_configs:
@@ -81,6 +85,8 @@ scrape_configs:
password: {{ service_discovery_password }}
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
{% else %}
http_sd_configs:
- url: {{ node_exporter_sd_url }}
@@ -104,6 +110,8 @@ scrape_configs:
password: {{ service_discovery_password }}
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
{% else %}
http_sd_configs:
- url: {{ haproxy_sd_url }}
@@ -128,6 +136,8 @@ scrape_configs:
password: {{ service_discovery_password }}
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
{% else %}
honor_labels: true
http_sd_configs:
@@ -149,6 +159,8 @@ scrape_configs:
password: {{ service_discovery_password }}
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
{% else %}
http_sd_configs:
- url: {{ nvmeof_sd_url }}
@@ -169,6 +181,8 @@ scrape_configs:
password: {{ service_discovery_password }}
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
{% else %}
http_sd_configs:
- url: {{ nfs_sd_url }}
@@ -189,6 +203,8 @@ scrape_configs:
password: {{ service_discovery_password }}
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
{% else %}
http_sd_configs:
- url: {{ smb_sd_url }}
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index b81510504d9..22bd26def91 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -1741,16 +1741,23 @@ class TestCephadm(object):
nvmeof_client_cert = 'fake-nvmeof-client-cert'
nvmeof_server_cert = 'fake-nvmeof-server-cert'
nvmeof_root_ca_cert = 'fake-nvmeof-root-ca-cert'
+ grafana_cert_host_1 = 'grafana-cert-host-1'
+ grafana_cert_host_2 = 'grafana-cert-host-2'
cephadm_module.cert_key_store.save_cert('rgw_frontend_ssl_cert', rgw_frontend_rgw_foo_host2_cert, service_name='rgw.foo', user_made=True)
cephadm_module.cert_key_store.save_cert('nvmeof_server_cert', nvmeof_server_cert, service_name='nvmeof.foo', user_made=True)
cephadm_module.cert_key_store.save_cert('nvmeof_client_cert', nvmeof_client_cert, service_name='nvmeof.foo', user_made=True)
cephadm_module.cert_key_store.save_cert('nvmeof_root_ca_cert', nvmeof_root_ca_cert, service_name='nvmeof.foo', user_made=True)
+ cephadm_module.cert_key_store.save_cert('grafana_cert', grafana_cert_host_1, host='host-1', user_made=True)
+ cephadm_module.cert_key_store.save_cert('grafana_cert', grafana_cert_host_2, host='host-2', user_made=True)
expected_calls = [
mock.call(f'{CERT_STORE_CERT_PREFIX}rgw_frontend_ssl_cert', json.dumps({'rgw.foo': Cert(rgw_frontend_rgw_foo_host2_cert, True).to_json()})),
mock.call(f'{CERT_STORE_CERT_PREFIX}nvmeof_server_cert', json.dumps({'nvmeof.foo': Cert(nvmeof_server_cert, True).to_json()})),
mock.call(f'{CERT_STORE_CERT_PREFIX}nvmeof_client_cert', json.dumps({'nvmeof.foo': Cert(nvmeof_client_cert, True).to_json()})),
mock.call(f'{CERT_STORE_CERT_PREFIX}nvmeof_root_ca_cert', json.dumps({'nvmeof.foo': Cert(nvmeof_root_ca_cert, True).to_json()})),
+ mock.call(f'{CERT_STORE_CERT_PREFIX}grafana_cert', json.dumps({'host-1': Cert(grafana_cert_host_1, True).to_json()})),
+ mock.call(f'{CERT_STORE_CERT_PREFIX}grafana_cert', json.dumps({'host-1': Cert(grafana_cert_host_1, True).to_json(),
+ 'host-2': Cert(grafana_cert_host_2, True).to_json()}))
]
_set_store.assert_has_calls(expected_calls)
@@ -1795,17 +1802,20 @@ class TestCephadm(object):
cephadm_module.cert_key_store._init_known_cert_key_dicts()
grafana_host1_key = 'fake-grafana-host1-key'
+ grafana_host2_key = 'fake-grafana-host2-key'
nvmeof_client_key = 'nvmeof-client-key'
nvmeof_server_key = 'nvmeof-server-key'
nvmeof_encryption_key = 'nvmeof-encryption-key'
- grafana_host1_key = 'fake-grafana-host1-cert'
cephadm_module.cert_key_store.save_key('grafana_key', grafana_host1_key, host='host1')
+ cephadm_module.cert_key_store.save_key('grafana_key', grafana_host2_key, host='host2')
cephadm_module.cert_key_store.save_key('nvmeof_client_key', nvmeof_client_key, service_name='nvmeof.foo')
cephadm_module.cert_key_store.save_key('nvmeof_server_key', nvmeof_server_key, service_name='nvmeof.foo')
cephadm_module.cert_key_store.save_key('nvmeof_encryption_key', nvmeof_encryption_key, service_name='nvmeof.foo')
expected_calls = [
mock.call(f'{CERT_STORE_KEY_PREFIX}grafana_key', json.dumps({'host1': PrivKey(grafana_host1_key).to_json()})),
+ mock.call(f'{CERT_STORE_KEY_PREFIX}grafana_key', json.dumps({'host1': PrivKey(grafana_host1_key).to_json(),
+ 'host2': PrivKey(grafana_host2_key).to_json()})),
mock.call(f'{CERT_STORE_KEY_PREFIX}nvmeof_client_key', json.dumps({'nvmeof.foo': PrivKey(nvmeof_client_key).to_json()})),
mock.call(f'{CERT_STORE_KEY_PREFIX}nvmeof_server_key', json.dumps({'nvmeof.foo': PrivKey(nvmeof_server_key).to_json()})),
mock.call(f'{CERT_STORE_KEY_PREFIX}nvmeof_encryption_key', json.dumps({'nvmeof.foo': PrivKey(nvmeof_encryption_key).to_json()})),
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index 0d89657ac8c..d872219df80 100644
--- a/src/pybind/mgr/cephadm/tests/test_services.py
+++ b/src/pybind/mgr/cephadm/tests/test_services.py
@@ -581,7 +581,14 @@ class TestMonitoring:
mock_getfqdn.return_value = purl.hostname
with with_host(cephadm_module, "test"):
- with with_service(cephadm_module, AlertManagerSpec()):
+ cephadm_module.cache.update_host_networks('test', {
+ '1.2.3.0/24': {
+ 'if0': ['1.2.3.1']
+ },
+ })
+ with with_service(cephadm_module, AlertManagerSpec('alertmanager',
+ networks=['1.2.3.0/24'],
+ only_bind_port_on_networks=True)):
y = dedent(self._get_config(expected_yaml_url)).lstrip()
_run_cephadm.assert_called_with(
'test',
@@ -595,11 +602,12 @@ class TestMonitoring:
"deploy_arguments": [],
"params": {
'tcp_ports': [9093, 9094],
+ 'port_ips': {"9094": "1.2.3.1"},
},
"meta": {
'service_name': 'alertmanager',
'ports': [9093, 9094],
- 'ip': None,
+ 'ip': '1.2.3.1',
'deployed_by': [],
'rank': None,
'rank_generation': None,
@@ -612,6 +620,7 @@ class TestMonitoring:
},
"peers": [],
"use_url_prefix": False,
+ "ip_to_bind_to": "1.2.3.1",
}
}),
error_ok=True,
@@ -634,8 +643,16 @@ class TestMonitoring:
cephadm_module.secure_monitoring_stack = True
cephadm_module.set_store(AlertmanagerService.USER_CFG_KEY, 'alertmanager_user')
cephadm_module.set_store(AlertmanagerService.PASS_CFG_KEY, 'alertmanager_plain_password')
+
+ cephadm_module.cache.update_host_networks('test', {
+ 'fd12:3456:789a::/64': {
+ 'if0': ['fd12:3456:789a::10']
+ },
+ })
with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \
- with_service(cephadm_module, AlertManagerSpec()):
+ with_service(cephadm_module, AlertManagerSpec('alertmanager',
+ networks=['fd12:3456:789a::/64'],
+ only_bind_port_on_networks=True)):
y = dedent("""
# This file is generated by cephadm.
@@ -646,6 +663,8 @@ class TestMonitoring:
http_config:
tls_config:
ca_file: root_cert.pem
+ cert_file: alertmanager.crt
+ key_file: alertmanager.key
route:
receiver: 'default'
@@ -686,11 +705,12 @@ class TestMonitoring:
"deploy_arguments": [],
"params": {
'tcp_ports': [9093, 9094],
+ 'port_ips': {"9094": "fd12:3456:789a::10"}
},
"meta": {
'service_name': 'alertmanager',
'ports': [9093, 9094],
- 'ip': None,
+ 'ip': 'fd12:3456:789a::10',
'deployed_by': [],
'rank': None,
'rank_generation': None,
@@ -708,6 +728,7 @@ class TestMonitoring:
'peers': [],
'web_config': '/etc/alertmanager/web.yml',
"use_url_prefix": True,
+ "ip_to_bind_to": "fd12:3456:789a::10",
}
}),
error_ok=True,
@@ -741,6 +762,8 @@ class TestMonitoring:
http_config:
tls_config:
ca_file: root_cert.pem
+ cert_file: alertmanager.crt
+ key_file: alertmanager.key
route:
receiver: 'default'
@@ -801,6 +824,7 @@ class TestMonitoring:
'peers': [],
'web_config': '/etc/alertmanager/web.yml',
"use_url_prefix": False,
+ "ip_to_bind_to": "",
}
}),
error_ok=True,
@@ -1170,6 +1194,8 @@ class TestMonitoring:
password: sd_password
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
scrape_configs:
- job_name: 'ceph'
@@ -1191,6 +1217,8 @@ class TestMonitoring:
password: sd_password
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
- job_name: 'node'
relabel_configs:
@@ -1209,6 +1237,8 @@ class TestMonitoring:
password: sd_password
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
- job_name: 'haproxy'
relabel_configs:
@@ -1225,6 +1255,8 @@ class TestMonitoring:
password: sd_password
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
- job_name: 'ceph-exporter'
relabel_configs:
@@ -1242,6 +1274,8 @@ class TestMonitoring:
password: sd_password
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
- job_name: 'nvmeof'
honor_labels: true
@@ -1255,6 +1289,8 @@ class TestMonitoring:
password: sd_password
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
- job_name: 'nfs'
honor_labels: true
@@ -1268,6 +1304,8 @@ class TestMonitoring:
password: sd_password
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
- job_name: 'smb'
honor_labels: true
@@ -1281,6 +1319,8 @@ class TestMonitoring:
password: sd_password
tls_config:
ca_file: root_cert.pem
+ cert_file: prometheus.crt
+ key_file: prometheus.key
""").lstrip()
@@ -2071,6 +2111,26 @@ class TestRGWService:
})
assert f == expected
+ @pytest.mark.parametrize(
+ "disable_sync_traffic",
+ [
+ (True),
+ (False),
+ ]
+ )
+ @patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+ def test_rgw_disable_sync_traffic(self, disable_sync_traffic, cephadm_module: CephadmOrchestrator):
+ with with_host(cephadm_module, 'host1'):
+ s = RGWSpec(service_id="foo",
+ disable_multisite_sync_traffic=disable_sync_traffic)
+ with with_service(cephadm_module, s) as dds:
+ _, f, _ = cephadm_module.check_mon_command({
+ 'prefix': 'config get',
+ 'who': f'client.{dds[0]}',
+ 'key': 'rgw_run_sync_thread',
+ })
+ assert f == ('false' if disable_sync_traffic else 'true')
+
class TestMonService:
@@ -3874,6 +3934,7 @@ class TestMgmtGateway:
http {
#access_log /dev/stdout;
+ error_log /dev/stderr info;
client_header_buffer_size 32K;
large_client_header_buffers 4 32k;
proxy_busy_buffers_size 512k;
@@ -4121,6 +4182,7 @@ class TestMgmtGateway:
http {
#access_log /dev/stdout;
+ error_log /dev/stderr info;
client_header_buffer_size 32K;
large_client_header_buffers 4 32k;
proxy_busy_buffers_size 512k;
diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py
index a505801eea5..4fbc975ae9f 100644
--- a/src/pybind/mgr/orchestrator/_interface.py
+++ b/src/pybind/mgr/orchestrator/_interface.py
@@ -747,6 +747,10 @@ class Orchestrator(object):
"""
raise NotImplementedError()
+ def set_osd_spec(self, service_name: str, osd_ids: List[str]) -> OrchResult:
+        """Set the service spec affinity for the given OSDs"""
+ raise NotImplementedError()
+
def blink_device_light(self, ident_fault: str, on: bool, locations: List['DeviceLightLoc']) -> OrchResult[List[str]]:
"""
Instructs the orchestrator to enable or disable either the ident or the fault LED.
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py
index 332bc75d862..d5a1bb3da2b 100644
--- a/src/pybind/mgr/orchestrator/module.py
+++ b/src/pybind/mgr/orchestrator/module.py
@@ -1472,6 +1472,14 @@ Usage:
return HandleCommandResult(stdout=out)
+ @_cli_write_command('orch osd set-spec-affinity')
+ def _osd_set_spec(self, service_name: str, osd_id: List[str]) -> HandleCommandResult:
+ """Set service spec affinity for osd"""
+ completion = self.set_osd_spec(service_name, osd_id)
+ res = raise_if_exception(completion)
+
+ return HandleCommandResult(stdout=res)
+
@_cli_write_command('orch daemon add')
def daemon_add_misc(self,
daemon_type: Optional[ServiceType] = None,
@@ -1666,7 +1674,13 @@ Usage:
specs: List[Union[ServiceSpec, HostSpec]] = []
# YAML '---' document separator with no content generates
# None entries in the output. Let's skip them silently.
- content = [o for o in yaml_objs if o is not None]
+ try:
+ content = [o for o in yaml_objs if o is not None]
+ except yaml.scanner.ScannerError as e:
+                msg = f"Invalid YAML received: {str(e)}"
+ self.log.exception(msg)
+ return HandleCommandResult(-errno.EINVAL, stderr=msg)
+
for s in content:
try:
spec = json_to_generic_spec(s)
@@ -2191,7 +2205,13 @@ Usage:
specs: List[TunedProfileSpec] = []
# YAML '---' document separator with no content generates
# None entries in the output. Let's skip them silently.
- content = [o for o in yaml_objs if o is not None]
+ try:
+ content = [o for o in yaml_objs if o is not None]
+ except yaml.scanner.ScannerError as e:
+                msg = f"Invalid YAML received: {str(e)}"
+ self.log.exception(msg)
+ return HandleCommandResult(-errno.EINVAL, stderr=msg)
+
for spec in content:
specs.append(TunedProfileSpec.from_json(spec))
else:
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py
index 8a2a38b86ee..1ac9fa49e32 100644
--- a/src/python-common/ceph/deployment/service_spec.py
+++ b/src/python-common/ceph/deployment/service_spec.py
@@ -1231,6 +1231,7 @@ class RGWSpec(ServiceSpec):
rgw_bucket_counters_cache: Optional[bool] = False,
rgw_bucket_counters_cache_size: Optional[int] = None,
generate_cert: bool = False,
+ disable_multisite_sync_traffic: Optional[bool] = None,
):
assert service_type == 'rgw', service_type
@@ -1283,6 +1284,8 @@ class RGWSpec(ServiceSpec):
self.rgw_bucket_counters_cache_size = rgw_bucket_counters_cache_size
#: Whether we should generate a cert/key for the user if not provided
self.generate_cert = generate_cert
+        #: Disable multisite sync traffic on this RGW so it can be dedicated to client IO
+ self.disable_multisite_sync_traffic = disable_multisite_sync_traffic
def get_port_start(self) -> List[int]:
return [self.get_port()]
@@ -2328,6 +2331,7 @@ class AlertManagerSpec(MonitoringSpec):
user_data: Optional[Dict[str, Any]] = None,
config: Optional[Dict[str, str]] = None,
networks: Optional[List[str]] = None,
+ only_bind_port_on_networks: bool = False,
port: Optional[int] = None,
secure: bool = False,
extra_container_args: Optional[GeneralArgList] = None,
@@ -2358,6 +2362,7 @@ class AlertManagerSpec(MonitoringSpec):
# <webhook_configs> configuration.
self.user_data = user_data or {}
self.secure = secure
+ self.only_bind_port_on_networks = only_bind_port_on_networks
def get_port_start(self) -> List[int]:
return [self.get_port(), 9094]
@@ -2404,7 +2409,7 @@ class GrafanaSpec(MonitoringSpec):
self.protocol = protocol
# whether ports daemons for this service bind to should
- # bind to only hte networks listed in networks param, or
+ # bind to only the networks listed in networks param, or
# to all networks. Defaults to false which is saying to bind
# on all networks.
self.only_bind_port_on_networks = only_bind_port_on_networks
diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt
index 3727c525ce7..41e473e23f0 100644
--- a/src/rgw/CMakeLists.txt
+++ b/src/rgw/CMakeLists.txt
@@ -487,9 +487,9 @@ target_link_libraries(radosgw PRIVATE
install(TARGETS radosgw DESTINATION bin)
set(radosgw_admin_srcs
- rgw_admin.cc
- rgw_sync_checkpoint.cc
- rgw_orphan.cc)
+ radosgw-admin/radosgw-admin.cc
+ radosgw-admin/sync_checkpoint.cc
+ radosgw-admin/orphan.cc)
# this is unsatisfying and hopefully temporary; ARROW should not be
# part of radosgw_admin
diff --git a/src/rgw/driver/daos/rgw_sal_daos.cc b/src/rgw/driver/daos/rgw_sal_daos.cc
index a87d88c4b85..92dd7afe2fb 100644
--- a/src/rgw/driver/daos/rgw_sal_daos.cc
+++ b/src/rgw/driver/daos/rgw_sal_daos.cc
@@ -858,8 +858,6 @@ bool DaosZone::is_writeable() { return true; }
bool DaosZone::get_redirect_endpoint(std::string* endpoint) { return false; }
-bool DaosZone::has_zonegroup_api(const std::string& api) const { return false; }
-
const std::string& DaosZone::get_current_period_id() {
return current_period->get_id();
}
diff --git a/src/rgw/driver/daos/rgw_sal_daos.h b/src/rgw/driver/daos/rgw_sal_daos.h
index e382fdb04ae..5515579a441 100644
--- a/src/rgw/driver/daos/rgw_sal_daos.h
+++ b/src/rgw/driver/daos/rgw_sal_daos.h
@@ -484,7 +484,6 @@ class DaosZone : public StoreZone {
virtual const std::string& get_name() const override;
virtual bool is_writeable() override;
virtual bool get_redirect_endpoint(std::string* endpoint) override;
- virtual bool has_zonegroup_api(const std::string& api) const override;
virtual const std::string& get_current_period_id() override;
virtual const RGWAccessKey& get_system_key() {
return zone_params->system_key;
diff --git a/src/rgw/driver/motr/rgw_sal_motr.cc b/src/rgw/driver/motr/rgw_sal_motr.cc
index b999673ac18..463ea8c5b11 100644
--- a/src/rgw/driver/motr/rgw_sal_motr.cc
+++ b/src/rgw/driver/motr/rgw_sal_motr.cc
@@ -1111,11 +1111,6 @@ bool MotrZone::get_redirect_endpoint(std::string* endpoint)
return false;
}
-bool MotrZone::has_zonegroup_api(const std::string& api) const
-{
- return (zonegroup.group.api_name == api);
-}
-
const std::string& MotrZone::get_current_period_id()
{
return current_period->get_id();
diff --git a/src/rgw/driver/motr/rgw_sal_motr.h b/src/rgw/driver/motr/rgw_sal_motr.h
index f92074b9d94..0f99ae48e86 100644
--- a/src/rgw/driver/motr/rgw_sal_motr.h
+++ b/src/rgw/driver/motr/rgw_sal_motr.h
@@ -525,7 +525,6 @@ class MotrZone : public StoreZone {
virtual const std::string& get_name() const override;
virtual bool is_writeable() override;
virtual bool get_redirect_endpoint(std::string* endpoint) override;
- virtual bool has_zonegroup_api(const std::string& api) const override;
virtual const std::string& get_current_period_id() override;
virtual const RGWAccessKey& get_system_key() { return zone_params->system_key; }
virtual const std::string& get_realm_name() { return realm->get_name(); }
diff --git a/src/rgw/driver/posix/rgw_sal_posix.cc b/src/rgw/driver/posix/rgw_sal_posix.cc
index 1345468210f..9d76462baa0 100644
--- a/src/rgw/driver/posix/rgw_sal_posix.cc
+++ b/src/rgw/driver/posix/rgw_sal_posix.cc
@@ -2893,6 +2893,14 @@ int POSIXObject::copy_object(const ACLOwner& owner,
return dobj->set_obj_attrs(dpp, &attrs, nullptr, y, rgw::sal::FLAG_LOG_OP);
}
+int POSIXObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int max_parts, int marker, int* next_marker,
+ bool* truncated, list_parts_each_t each_func,
+ optional_yield y)
+{
+ return -EOPNOTSUPP;
+}
+
int POSIXObject::load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh)
{
int ret = stat(dpp);
diff --git a/src/rgw/driver/posix/rgw_sal_posix.h b/src/rgw/driver/posix/rgw_sal_posix.h
index 8ec72bbc1bc..bf3478ad6ab 100644
--- a/src/rgw/driver/posix/rgw_sal_posix.h
+++ b/src/rgw/driver/posix/rgw_sal_posix.h
@@ -653,6 +653,13 @@ public:
const DoutPrefixProvider* dpp, optional_yield y) override;
virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; }
+
+  /** If multipart, enumerate (a range [marker..marker+min(max_parts, parts_count-1)] of) parts of the object */
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int max_parts, int marker, int* next_marker,
+ bool* truncated, list_parts_each_t each_func,
+ optional_yield y) override;
+
virtual int load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh = true) override;
virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs,
Attrs* delattrs, optional_yield y, uint32_t flags) override;
diff --git a/src/rgw/driver/rados/rgw_period.cc b/src/rgw/driver/rados/rgw_period.cc
index f18e8e46bc5..aacb9b6a09a 100644
--- a/src/rgw/driver/rados/rgw_period.cc
+++ b/src/rgw/driver/rados/rgw_period.cc
@@ -68,20 +68,6 @@ int RGWPeriod::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
return ret;
}
-int RGWPeriod::add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y)
-{
- if (zonegroup.realm_id != realm_id) {
- return 0;
- }
- int ret = period_map.update(zonegroup, cct);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
- return ret;
- }
-
- return store_info(dpp, false, y);
-}
-
int RGWPeriod::update(const DoutPrefixProvider *dpp, optional_yield y)
{
auto zone_svc = sysobj_svc->get_zone_svc();
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc
index 0b77bca1da7..69075c506f1 100644
--- a/src/rgw/driver/rados/rgw_rados.cc
+++ b/src/rgw/driver/rados/rgw_rados.cc
@@ -6962,13 +6962,13 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBu
}
return 0;
-}
+} /* RGWRados::set_attrs() */
-static int get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y,
- RGWRados* store, RGWBucketInfo& bucket_info,
- RGWObjectCtx* rctx, RGWObjManifest* manifest,
- int part_num, int* parts_count, bool prefetch,
- RGWObjState** pstate, RGWObjManifest** pmanifest)
+int RGWRados::get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y,
+ RGWRados* store, RGWBucketInfo& bucket_info,
+ RGWObjectCtx* rctx, RGWObjManifest* manifest,
+ int part_num, int* parts_count, bool prefetch,
+ RGWObjState** pstate, RGWObjManifest** pmanifest)
{
if (!manifest) {
return -ERR_INVALID_PART;
@@ -7047,6 +7047,9 @@ static int get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y,
// update the object size
sm->state.size = part_manifest.get_obj_size();
+ if (!sm->state.attrset.count(RGW_ATTR_COMPRESSION)) {
+ sm->state.accounted_size = sm->state.size;
+ }
*pmanifest = &part_manifest;
return 0;
diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h
index b24823b60dc..fe79916392f 100644
--- a/src/rgw/driver/rados/rgw_rados.h
+++ b/src/rgw/driver/rados/rgw_rados.h
@@ -1071,6 +1071,12 @@ public:
}; // class RGWRados::Bucket::List
}; // class RGWRados::Bucket
+ static int get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y,
+ RGWRados* store, RGWBucketInfo& bucket_info,
+ RGWObjectCtx* rctx, RGWObjManifest* manifest,
+ int part_num, int* parts_count, bool prefetch,
+ RGWObjState** pstate, RGWObjManifest** pmanifest);
+
int on_last_entry_in_listing(const DoutPrefixProvider *dpp,
RGWBucketInfo& bucket_info,
const std::string& obj_prefix,
diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc
index 88da446c3de..4c67d0ee71a 100644
--- a/src/rgw/driver/rados/rgw_sal_rados.cc
+++ b/src/rgw/driver/rados/rgw_sal_rados.cc
@@ -2471,7 +2471,108 @@ bool RadosObject::is_sync_completed(const DoutPrefixProvider* dpp,
const rgw_bi_log_entry& earliest_marker = entries.front();
return earliest_marker.timestamp > obj_mtime;
-}
+} /* is_sync_completed */
+
+int RadosObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int max_parts, int marker, int* next_marker,
+ bool* truncated, list_parts_each_t each_func,
+ optional_yield y)
+{
+ int ret{0};
+
+  /* requires an object with a manifest; a call to get_obj_state() must precede this */
+ if (! manifest) {
+ return -EINVAL;
+ }
+
+ RGWObjManifest::obj_iterator end = manifest->obj_end(dpp);
+ if (end.get_cur_part_id() == 0) { // not multipart
+ ldpp_dout(dpp, 20) << __func__ << " object does not have a multipart manifest"
+ << dendl;
+ return 0;
+ }
+
+ auto end_part_id = end.get_cur_part_id();
+ auto parts_count = (end_part_id == 1) ? 1 : end_part_id - 1;
+ if (marker > (parts_count - 1)) {
+ return 0;
+ }
+
+ RGWObjManifest::obj_iterator part_iter = manifest->obj_begin(dpp);
+
+ if (marker != 0) {
+ ldpp_dout_fmt(dpp, 20,
+ "{} seeking to part #{} in the object manifest",
+ __func__, marker);
+
+ part_iter = manifest->obj_find_part(dpp, marker + 1);
+
+ if (part_iter == end) {
+ ldpp_dout_fmt(dpp, 5,
+ "{} failed to find part #{} in the object manifest",
+ __func__, marker + 1);
+ return 0;
+ }
+ }
+
+ RGWObjectCtx& obj_ctx = get_ctx();
+ RGWBucketInfo& bucket_info = get_bucket()->get_info();
+
+ Object::Part obj_part{};
+ for (; part_iter != manifest->obj_end(dpp); ++part_iter) {
+
+ /* we're only interested in the first object in each logical part */
+ auto cur_part_id = part_iter.get_cur_part_id();
+ if (cur_part_id == obj_part.part_number) {
+ continue;
+ }
+
+ if (max_parts < 1) {
+ *truncated = true;
+ break;
+ }
+
+ /* get_part_obj_state alters the passed manifest** to point to a part
+ * manifest, which we don't want to leak out here */
+ RGWObjManifest* obj_m = manifest;
+ RGWObjState* astate;
+ bool part_prefetch = false;
+ ret = RGWRados::get_part_obj_state(dpp, y, store->getRados(), bucket_info, &obj_ctx,
+ obj_m, cur_part_id, &parts_count,
+ part_prefetch, &astate, &obj_m);
+
+ if (ret < 0) {
+ ldpp_dout_fmt(dpp, 4,
+ "{} get_part_obj_state() failed ret={}",
+ __func__, ret);
+ break;
+ }
+
+ obj_part.part_number = part_iter.get_cur_part_id();
+ obj_part.part_size = astate->accounted_size;
+
+ if (auto iter = astate->attrset.find(RGW_ATTR_CKSUM);
+ iter != astate->attrset.end()) {
+ try {
+ rgw::cksum::Cksum part_cksum;
+ auto ck_iter = iter->second.cbegin();
+ part_cksum.decode(ck_iter);
+ obj_part.cksum = std::move(part_cksum);
+ } catch (buffer::error& err) {
+ ldpp_dout_fmt(dpp, 4,
+ "WARN: {} could not decode stored cksum, "
+ "caught buffer::error",
+ __func__);
+ }
+ }
+
+ each_func(obj_part);
+ *next_marker = ++marker;
+ --max_parts;
+ } /* each part */
+
+ return ret;
+} /* RadosObject::list_parts */
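
list_parts() above walks the manifest with 1-based part numbers: marker is the number of parts already returned, max_parts is the remaining budget, and truncated/next_marker drive the next page. The same contract over a plain vector, as a hedged sketch with hypothetical types:

#include <cstdint>
#include <functional>
#include <vector>

struct PartSketch { int part_number; uint64_t part_size; };

int list_parts_sketch(const std::vector<PartSketch>& parts, int max_parts,
                      int marker, int* next_marker, bool* truncated,
                      const std::function<int(const PartSketch&)>& each) {
  *truncated = false;
  for (const auto& p : parts) {
    if (p.part_number <= marker) continue;   // seek past already-listed parts
    if (max_parts < 1) { *truncated = true; break; }
    each(p);
    *next_marker = ++marker;                 // resume after this part next time
    --max_parts;
  }
  return 0;
}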
int RadosObject::load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh)
{
@@ -4500,11 +4601,6 @@ bool RadosZone::get_redirect_endpoint(std::string* endpoint)
return true;
}
-bool RadosZone::has_zonegroup_api(const std::string& api) const
-{
- return store->svc()->zone->has_zonegroup_api(api);
-}
-
const std::string& RadosZone::get_current_period_id()
{
return store->svc()->zone->get_current_period_id();
diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h
index 23d81a934b0..85ea247e345 100644
--- a/src/rgw/driver/rados/rgw_sal_rados.h
+++ b/src/rgw/driver/rados/rgw_sal_rados.h
@@ -107,7 +107,6 @@ class RadosZone : public StoreZone {
virtual const std::string& get_name() const override;
virtual bool is_writeable() override;
virtual bool get_redirect_endpoint(std::string* endpoint) override;
- virtual bool has_zonegroup_api(const std::string& api) const override;
virtual const std::string& get_current_period_id() override;
virtual const RGWAccessKey& get_system_key() override;
virtual const std::string& get_realm_name() override;
@@ -593,12 +592,18 @@ class RadosObject : public StoreObject {
StoreObject::set_compressed();
}
-
virtual bool is_sync_completed(const DoutPrefixProvider* dpp,
const ceph::real_time& obj_mtime) override;
/* For rgw_admin.cc */
RGWObjState& get_state() { return state; }
virtual int load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh = true) override;
+
+  /** If multipart, enumerate parts of the object in the range [marker, marker+min(max_parts, parts_count-1)] */
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int max_parts, int marker, int* next_marker,
+ bool* truncated, list_parts_each_t each_func,
+ optional_yield y) override;
+
virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y, uint32_t flags) override;
virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override;
virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override;
diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc
index 79d2be0bcfa..bf7a309e864 100644
--- a/src/rgw/driver/rados/rgw_tools.cc
+++ b/src/rgw/driver/rados/rgw_tools.cc
@@ -339,21 +339,35 @@ int rgw_list_pool(const DoutPrefixProvider *dpp,
ldpp_dout(dpp, 10) << "failed to parse cursor: " << marker << dendl;
return -EINVAL;
}
-
- auto iter = ioctx.nobjects_begin(oc);
+ librados::NObjectIterator iter;
+ try {
+ iter = ioctx.nobjects_begin(oc);
+ } catch (const std::system_error& e) {
+ ldpp_dout(dpp, 1) << "rgw_list_pool: Failed to begin iteration of pool "
+ << ioctx.get_pool_name() << " with error "
+ << e.what() << dendl;
+ return ceph::from_error_code(e.code());
+ }
/// Pool_iterate
if (iter == ioctx.nobjects_end())
return -ENOENT;
- for (; oids->size() < max && iter != ioctx.nobjects_end(); ++iter) {
- string oid = iter->get_oid();
- ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
+ try {
+ for (; oids->size() < max && iter != ioctx.nobjects_end(); ++iter) {
+ string oid = iter->get_oid();
+ ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
- // fill it in with initial values; we may correct later
- if (filter && !filter(oid, oid))
- continue;
+ // fill it in with initial values; we may correct later
+ if (filter && !filter(oid, oid))
+ continue;
- oids->push_back(oid);
+ oids->push_back(oid);
+ }
+ } catch (const std::system_error& e) {
+ ldpp_dout(dpp, 1) << "rgw_list_pool: Failed iterating pool "
+ << ioctx.get_pool_name() << " with error "
+ << e.what() << dendl;
+ return ceph::from_error_code(e.code());
}
marker = iter.get_cursor().to_str();
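
This rgw_tools.cc change wraps both nobjects_begin() and the iteration loop in try/catch because librados' NObjectIterator reports failures by throwing std::system_error rather than returning a code. The translation pattern in isolation; errno_from() is a stand-in assumption, not the real ceph::from_error_code():

#include <system_error>

// assumed behavior of the error-code-to-errno translation
inline int errno_from(const std::error_code& ec) {
  return ec ? -ec.value() : 0;
}

template <typename Iter, typename Fn>
int iterate_guarded(Iter first, Iter last, Fn&& fn) {
  try {
    for (; first != last; ++first) {
      fn(*first);
    }
  } catch (const std::system_error& e) {
    return errno_from(e.code());  // surface the throw as a negative errno
  }
  return 0;
}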
diff --git a/src/rgw/driver/rados/rgw_user.h b/src/rgw/driver/rados/rgw_user.h
index ab157f38e39..4ae7d13eff7 100644
--- a/src/rgw/driver/rados/rgw_user.h
+++ b/src/rgw/driver/rados/rgw_user.h
@@ -19,11 +19,11 @@
#define RGW_USER_ANON_ID "anonymous"
-#define SECRET_KEY_LEN 40
-#define PUBLIC_ID_LEN 20
-#define RAND_SUBUSER_LEN 5
+constexpr auto SECRET_KEY_LEN = 40;
+constexpr auto PUBLIC_ID_LEN = 20;
+constexpr auto RAND_SUBUSER_LEN = 5;
-#define XMLNS_AWS_S3 "http://s3.amazonaws.com/doc/2006-03-01/"
+constexpr auto XMLNS_AWS_S3 = "http://s3.amazonaws.com/doc/2006-03-01/";
class RGWUserCtl;
class RGWBucketCtl;
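
Swapping the macros for constexpr constants changes no behavior but gives each constant a type, a scope, and visibility to the debugger. A short illustration of what the replacement buys:

#include <string_view>

constexpr int SECRET_KEY_LEN_SKETCH = 40;        // typed, scoped constant
constexpr std::string_view XMLNS_SKETCH =
    "http://s3.amazonaws.com/doc/2006-03-01/";   // string constants work too
static_assert(SECRET_KEY_LEN_SKETCH == 40);      // usable in constant expressions
// the old #define leaked into every including TU with no type or namespace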
diff --git a/src/rgw/driver/rados/rgw_zone.h b/src/rgw/driver/rados/rgw_zone.h
index c542abc76d6..5fb2b4b8096 100644
--- a/src/rgw/driver/rados/rgw_zone.h
+++ b/src/rgw/driver/rados/rgw_zone.h
@@ -769,7 +769,6 @@ public:
int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
int delete_obj(const DoutPrefixProvider *dpp, optional_yield y);
int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
- int add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y);
void fork();
int update(const DoutPrefixProvider *dpp, optional_yield y);
diff --git a/src/rgw/rgw_orphan.cc b/src/rgw/radosgw-admin/orphan.cc
index b7dc562c721..9fca3b99a7c 100644
--- a/src/rgw/rgw_orphan.cc
+++ b/src/rgw/radosgw-admin/orphan.cc
@@ -1,6 +1,12 @@
+
+/*
+ * Copyright (C) 2024 IBM
+ */
+
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
+#include "radosgw-admin/orphan.h"
#include <string>
@@ -10,7 +16,6 @@
#include "rgw_op.h"
#include "rgw_multi.h"
-#include "rgw_orphan.h"
#include "rgw_zone.h"
#include "rgw_bucket.h"
#include "rgw_sal_rados.h"
diff --git a/src/rgw/rgw_orphan.h b/src/rgw/radosgw-admin/orphan.h
index db811d31d9a..db811d31d9a 100644
--- a/src/rgw/rgw_orphan.h
+++ b/src/rgw/radosgw-admin/orphan.h
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/radosgw-admin/radosgw-admin.cc
index f2c1e81b540..182e42b8e31 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/radosgw-admin/radosgw-admin.cc
@@ -1,12 +1,15 @@
+/*
+ * Copyright (C) 2025 IBM
+ */
+
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp
-#include <errno.h>
-#include <iostream>
-#include <sstream>
+#include <cerrno>
#include <string>
-
-#include <boost/optional.hpp>
+#include <sstream>
+#include <optional>
+#include <iostream>
extern "C" {
#include <liboath/oath.h>
@@ -38,6 +41,9 @@ extern "C" {
#include "include/utime.h"
#include "include/str_list.h"
+#include "radosgw-admin/orphan.h"
+#include "radosgw-admin/sync_checkpoint.h"
+
#include "rgw_user.h"
#include "rgw_otp.h"
#include "rgw_rados.h"
@@ -48,7 +54,6 @@ extern "C" {
#include "rgw_log.h"
#include "rgw_formats.h"
#include "rgw_usage.h"
-#include "rgw_orphan.h"
#include "rgw_sync.h"
#include "rgw_trim_bilog.h"
#include "rgw_trim_datalog.h"
@@ -62,7 +67,6 @@ extern "C" {
#include "rgw_zone.h"
#include "rgw_pubsub.h"
#include "rgw_bucket_sync.h"
-#include "rgw_sync_checkpoint.h"
#include "rgw_lua.h"
#include "rgw_sal.h"
#include "rgw_sal_config.h"
@@ -82,11 +86,6 @@ extern "C" {
#define dout_context g_ceph_context
-#define SECRET_KEY_LEN 40
-#define PUBLIC_ID_LEN 20
-
-using namespace std;
-
static rgw::sal::Driver* driver = NULL;
static constexpr auto dout_subsys = ceph_subsys_rgw;
@@ -117,19 +116,13 @@ static const DoutPrefixProvider* dpp() {
} \
} while (0)
-static inline int posix_errortrans(int r)
+using namespace std;
+
+inline int posix_errortrans(int r)
{
- switch(r) {
- case ERR_NO_SUCH_BUCKET:
- r = ENOENT;
- break;
- default:
- break;
- }
- return r;
+ return ERR_NO_SUCH_BUCKET == r ? ENOENT : r;
}
-
static const std::string LUA_CONTEXT_LIST("prerequest, postrequest, background, getdata, putdata");
void usage()
@@ -1272,7 +1265,7 @@ static int read_input(const string& infile, bufferlist& bl)
}
}
-#define READ_CHUNK 8196
+  constexpr auto READ_CHUNK = 8196;
int r;
int err;
diff --git a/src/rgw/rgw_sync_checkpoint.cc b/src/rgw/radosgw-admin/sync_checkpoint.cc
index 1172e79a48f..0303ed6c747 100644
--- a/src/rgw/rgw_sync_checkpoint.cc
+++ b/src/rgw/radosgw-admin/sync_checkpoint.cc
@@ -5,6 +5,7 @@
* Ceph - scalable distributed file system
*
* Copyright (C) 2020 Red Hat, Inc.
+ * Copyright (C) 2024 IBM
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -13,9 +14,12 @@
*
*/
+#include "radosgw-admin/sync_checkpoint.h"
+
#include <fmt/format.h>
+
#include "common/errno.h"
-#include "rgw_sync_checkpoint.h"
+
#include "rgw_sal_rados.h"
#include "rgw_bucket_sync.h"
#include "rgw_data_sync.h"
diff --git a/src/rgw/rgw_sync_checkpoint.h b/src/rgw/radosgw-admin/sync_checkpoint.h
index 28df68d8860..28df68d8860 100644
--- a/src/rgw/rgw_sync_checkpoint.h
+++ b/src/rgw/radosgw-admin/sync_checkpoint.h
diff --git a/src/rgw/rgw_cksum_pipe.cc b/src/rgw/rgw_cksum_pipe.cc
index e06957e2715..0bec8d341af 100644
--- a/src/rgw/rgw_cksum_pipe.cc
+++ b/src/rgw/rgw_cksum_pipe.cc
@@ -18,6 +18,7 @@
#include <string>
#include <fmt/format.h>
#include <boost/algorithm/string.hpp>
+#include "rgw_cksum.h"
#include "rgw_common.h"
#include "common/dout.h"
#include "rgw_client_io.h"
@@ -34,7 +35,8 @@ namespace rgw::putobj {
{}
std::unique_ptr<RGWPutObj_Cksum> RGWPutObj_Cksum::Factory(
- rgw::sal::DataProcessor* next, const RGWEnv& env)
+ rgw::sal::DataProcessor* next, const RGWEnv& env,
+ rgw::cksum::Type override_type)
{
/* look for matching headers */
auto algo_header = cksum_algorithm_hdr(env);
@@ -49,6 +51,13 @@ namespace rgw::putobj {
throw rgw::io::Exception(EINVAL, std::system_category());
}
/* no checksum header */
+ if (override_type != rgw::cksum::Type::none) {
+ /* XXXX safe? do we need to fixup env as well? */
+ auto algo_header = cksum_algorithm_hdr(override_type);
+ return
+ std::make_unique<RGWPutObj_Cksum>(
+ next, override_type, std::move(algo_header));
+ }
return std::unique_ptr<RGWPutObj_Cksum>();
}
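
The Factory change adds an override path: an explicit client checksum header still wins, but when no header is present a non-none override_type (carried from the multipart upload's cksum_type in the rgw_op.cc hunks below) synthesizes a filter anyway. The precedence, reduced to a sketch with hypothetical types:

#include <memory>

enum class CkTypeSketch { none, crc32, sha256 };

struct CkFilterSketch {
  CkTypeSketch type;
  explicit CkFilterSketch(CkTypeSketch t) : type(t) {}
};

std::unique_ptr<CkFilterSketch> make_filter(CkTypeSketch from_header,
                                            CkTypeSketch override_type) {
  if (from_header != CkTypeSketch::none)     // client asked explicitly
    return std::make_unique<CkFilterSketch>(from_header);
  if (override_type != CkTypeSketch::none)   // inherited upload-level override
    return std::make_unique<CkFilterSketch>(override_type);
  return nullptr;                            // no checksumming requested
}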
diff --git a/src/rgw/rgw_cksum_pipe.h b/src/rgw/rgw_cksum_pipe.h
index fddcd283c84..c459d156335 100644
--- a/src/rgw/rgw_cksum_pipe.h
+++ b/src/rgw/rgw_cksum_pipe.h
@@ -20,6 +20,7 @@
#include <tuple>
#include <cstring>
#include <boost/algorithm/string/case_conv.hpp>
+#include "rgw_cksum.h"
#include "rgw_cksum_digest.h"
#include "rgw_common.h"
#include "rgw_putobj.h"
@@ -29,6 +30,38 @@ namespace rgw::putobj {
namespace cksum = rgw::cksum;
using cksum_hdr_t = std::pair<const char*, const char*>;
+ static inline const cksum_hdr_t cksum_algorithm_hdr(rgw::cksum::Type t) {
+ static constexpr std::string_view hdr =
+ "HTTP_X_AMZ_SDK_CHECKSUM_ALGORITHM";
+ using rgw::cksum::Type;
+ switch (t) {
+ case Type::sha256:
+ return cksum_hdr_t(hdr.data(), "SHA256");
+ break;
+ case Type::crc32:
+ return cksum_hdr_t(hdr.data(), "CRC32");
+ break;
+ case Type::crc32c:
+ return cksum_hdr_t(hdr.data(), "CRC32C");
+ break;
+ case Type::xxh3:
+ return cksum_hdr_t(hdr.data(), "XX3");
+ break;
+ case Type::sha1:
+ return cksum_hdr_t(hdr.data(), "SHA1");
+ break;
+ case Type::sha512:
+ return cksum_hdr_t(hdr.data(), "SHA512");
+ break;
+ case Type::blake3:
+ return cksum_hdr_t(hdr.data(), "BLAKE3");
+ break;
+ default:
+ break;
+ };
+    return cksum_hdr_t(nullptr, nullptr);
+ }
+
static inline const cksum_hdr_t cksum_algorithm_hdr(const RGWEnv& env) {
/* If the individual checksum value you provide through
x-amz-checksum-algorithm doesn't match the checksum algorithm
@@ -102,7 +135,8 @@ namespace rgw::putobj {
using VerifyResult = std::tuple<bool, const cksum::Cksum&>;
static std::unique_ptr<RGWPutObj_Cksum> Factory(
- rgw::sal::DataProcessor* next, const RGWEnv&);
+ rgw::sal::DataProcessor* next, const RGWEnv&,
+ rgw::cksum::Type override_type);
RGWPutObj_Cksum(rgw::sal::DataProcessor* next, rgw::cksum::Type _type,
cksum_hdr_t&& _hdr);
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index 97c46f12bd0..6610538542c 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -63,6 +63,7 @@ rgw_http_errors rgw_http_s3_errors({
{ ERR_INVALID_DIGEST, {400, "InvalidDigest" }},
{ ERR_BAD_DIGEST, {400, "BadDigest" }},
{ ERR_INVALID_LOCATION_CONSTRAINT, {400, "InvalidLocationConstraint" }},
+ { ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION, {400, "IllegalLocationConstraintException" }},
{ ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION, {400, "ZonegroupDefaultPlacementMisconfiguration" }},
{ ERR_INVALID_BUCKET_NAME, {400, "InvalidBucketName" }},
{ ERR_INVALID_OBJECT_NAME, {400, "InvalidObjectName" }},
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index f0bd41494c3..d2917838f36 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -337,6 +337,7 @@ inline constexpr const char* RGW_REST_STS_XMLNS =
#define ERR_PRESIGNED_URL_EXPIRED 2223
#define ERR_PRESIGNED_URL_DISABLED 2224
#define ERR_AUTHORIZATION 2225 // SNS 403 AuthorizationError
+#define ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION 2226
#define ERR_BUSY_RESHARDING 2300 // also in cls_rgw_types.h, don't change!
#define ERR_NO_SUCH_ENTITY 2301
diff --git a/src/rgw/rgw_iam_policy.cc b/src/rgw/rgw_iam_policy.cc
index 2a5c9cd313e..ef6761d4222 100644
--- a/src/rgw/rgw_iam_policy.cc
+++ b/src/rgw/rgw_iam_policy.cc
@@ -94,6 +94,8 @@ static const actpair actpairs[] =
{ "s3:GetPublicAccessBlock", s3GetPublicAccessBlock },
{ "s3:GetObjectAcl", s3GetObjectAcl },
{ "s3:GetObject", s3GetObject },
+ { "s3:GetObjectAttributes", s3GetObjectAttributes },
+ { "s3:GetObjectVersionAttributes", s3GetObjectVersionAttributes },
{ "s3:GetObjectTorrent", s3GetObjectTorrent },
{ "s3:GetObjectVersionAcl", s3GetObjectVersionAcl },
{ "s3:GetObjectVersion", s3GetObjectVersion },
@@ -1335,6 +1337,7 @@ const char* action_bit_string(uint64_t action) {
case s3ListBucketVersions:
return "s3:ListBucketVersions";
+
case s3ListAllMyBuckets:
return "s3:ListAllMyBuckets";
@@ -1479,6 +1482,12 @@ const char* action_bit_string(uint64_t action) {
case s3BypassGovernanceRetention:
return "s3:BypassGovernanceRetention";
+ case s3GetObjectAttributes:
+ return "s3:GetObjectAttributes";
+
+ case s3GetObjectVersionAttributes:
+ return "s3:GetObjectVersionAttributes";
+
case s3DescribeJob:
return "s3:DescribeJob";
diff --git a/src/rgw/rgw_iam_policy.h b/src/rgw/rgw_iam_policy.h
index 0476926143f..dd323ee4b9c 100644
--- a/src/rgw/rgw_iam_policy.h
+++ b/src/rgw/rgw_iam_policy.h
@@ -115,6 +115,8 @@ enum {
s3GetBucketEncryption,
s3PutBucketEncryption,
s3DescribeJob,
+ s3GetObjectAttributes,
+ s3GetObjectVersionAttributes,
s3All,
s3objectlambdaGetObject,
@@ -247,6 +249,8 @@ inline int op_to_perm(std::uint64_t op) {
case s3GetObjectVersionTagging:
case s3GetObjectRetention:
case s3GetObjectLegalHold:
+ case s3GetObjectAttributes:
+ case s3GetObjectVersionAttributes:
case s3ListAllMyBuckets:
case s3ListBucket:
case s3ListBucketMultipartUploads:
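
Both new actions are classified as reads, matching their sibling Get* actions in op_to_perm(). The mapping in a reduced sketch (bit values and the permission constant are hypothetical):

#include <cstdint>

enum : std::uint64_t {
  s3GetObjectAttributesSketch = 1,
  s3GetObjectVersionAttributesSketch = 2,
};
constexpr int PERM_READ_SKETCH = 0x01;  // illustrative value only

constexpr int op_to_perm_sketch(std::uint64_t op) {
  switch (op) {
    case s3GetObjectAttributesSketch:
    case s3GetObjectVersionAttributesSketch:
      return PERM_READ_SKETCH;  // attribute reads need only READ on the object
    default:
      return 0;
  }
}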
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 5ce0033de47..7b0ca3134a3 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -25,8 +25,10 @@
#include "common/ceph_json.h"
#include "common/static_ptr.h"
#include "common/perf_counters_key.h"
+#include "rgw_cksum.h"
#include "rgw_cksum_digest.h"
#include "rgw_common.h"
+#include "common/split.h"
#include "rgw_tracer.h"
#include "rgw_rados.h"
@@ -3558,54 +3560,62 @@ void RGWCreateBucket::execute(optional_yield y)
const rgw::SiteConfig& site = *s->penv.site;
const std::optional<RGWPeriod>& period = site.get_period();
const RGWZoneGroup& my_zonegroup = site.get_zonegroup();
-
- if (s->system_request) {
- // allow system requests to override the target zonegroup. for forwarded
- // requests, we'll create the bucket for the originating zonegroup
- createparams.zonegroup_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "zonegroup");
- }
-
+ const std::string rgwx_zonegroup = s->info.args.get(RGW_SYS_PARAM_PREFIX "zonegroup");
const RGWZoneGroup* bucket_zonegroup = &my_zonegroup;
- if (createparams.zonegroup_id.empty()) {
- // default to the local zonegroup
- createparams.zonegroup_id = my_zonegroup.id;
- } else if (period) {
- auto z = period->period_map.zonegroups.find(createparams.zonegroup_id);
- if (z == period->period_map.zonegroups.end()) {
- ldpp_dout(this, 0) << "could not find zonegroup "
- << createparams.zonegroup_id << " in current period" << dendl;
- op_ret = -ENOENT;
- return;
- }
- bucket_zonegroup = &z->second;
- } else if (createparams.zonegroup_id != my_zonegroup.id) {
- ldpp_dout(this, 0) << "zonegroup does not match current zonegroup "
- << createparams.zonegroup_id << dendl;
- op_ret = -ENOENT;
- return;
- }
- // validate the LocationConstraint
+ // Validate LocationConstraint if it's provided and enforcement is strict
if (!location_constraint.empty() && !relaxed_region_enforcement) {
- // on the master zonegroup, allow any valid api_name. otherwise it has to
- // match the bucket's zonegroup
- if (period && my_zonegroup.is_master) {
- if (!period->period_map.zonegroups_by_api.count(location_constraint)) {
+ if (period) {
+ auto location_iter = period->period_map.zonegroups_by_api.find(location_constraint);
+ if (location_iter == period->period_map.zonegroups_by_api.end()) {
ldpp_dout(this, 0) << "location constraint (" << location_constraint
<< ") can't be found." << dendl;
op_ret = -ERR_INVALID_LOCATION_CONSTRAINT;
- s->err.message = "The specified location-constraint is not valid";
+ s->err.message = fmt::format("The {} location constraint is not valid.",
+ location_constraint);
return;
}
- } else if (bucket_zonegroup->api_name != location_constraint) {
+ bucket_zonegroup = &location_iter->second;
+    } else if (location_constraint != my_zonegroup.api_name) { // without a period, only the local zonegroup is known, so match by api_name
ldpp_dout(this, 0) << "location constraint (" << location_constraint
- << ") doesn't match zonegroup (" << bucket_zonegroup->api_name
- << ')' << dendl;
- op_ret = -ERR_INVALID_LOCATION_CONSTRAINT;
- s->err.message = "The specified location-constraint is not valid";
+ << ") doesn't match zonegroup (" << my_zonegroup.api_name << ")" << dendl;
+ op_ret = -ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION;
+ s->err.message = fmt::format("The {} location constraint is incompatible "
+ "for the region specific endpoint this request was sent to.",
+ location_constraint);
return;
}
}
+ // If it's a system request, use the provided zonegroup if available
+ else if (s->system_request && !rgwx_zonegroup.empty()) {
+ if (period) {
+ auto zonegroup_iter = period->period_map.zonegroups.find(rgwx_zonegroup);
+ if (zonegroup_iter == period->period_map.zonegroups.end()) {
+ ldpp_dout(this, 0) << "could not find zonegroup " << rgwx_zonegroup
+ << " in current period" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ bucket_zonegroup = &zonegroup_iter->second;
+ }
+ }
+
+  const bool enforce_location_match =
+    !period ||               // no period: only the local zonegroup exists, so enforce
+    !s->system_request ||    // user requests must always match the local zonegroup
+    !my_zonegroup.is_master; // forwarded requests may create remotely only via the master
+ if (enforce_location_match && !my_zonegroup.equals(bucket_zonegroup->get_id())) {
+ ldpp_dout(this, 0) << "location constraint (" << bucket_zonegroup->api_name
+ << ") doesn't match zonegroup (" << my_zonegroup.api_name << ")" << dendl;
+ op_ret = -ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION;
+ s->err.message = fmt::format("The {} location constraint is incompatible "
+ "for the region specific endpoint this request was sent to.",
+ bucket_zonegroup->api_name);
+ return;
+ }
+
+ // Set the final zonegroup ID
+ createparams.zonegroup_id = bucket_zonegroup->id;
// select and validate the placement target
op_ret = select_bucket_placement(this, *bucket_zonegroup, s->user->get_info(),
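
The rewritten flow reduces to one enforcement rule: a user request must always land in the local zonegroup, and a forwarded (system) request may create a bucket for a remote zonegroup only when the local zonegroup is the master. The boolean above, restated as a sketch:

// enforce the local-zonegroup match when any of these holds
bool must_match_local(bool has_period, bool is_system_request, bool is_master) {
  return !has_period          // no period: only the local zonegroup exists
      || !is_system_request   // end-user request: always pin to local
      || !is_master;          // forwarded request, but we are not the master
}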
@@ -3614,7 +3624,7 @@ void RGWCreateBucket::execute(optional_yield y)
return;
}
- if (bucket_zonegroup == &my_zonegroup) {
+ if (my_zonegroup.equals(bucket_zonegroup->get_id())) {
// look up the zone placement pool
createparams.zone_placement = rgw::find_zone_placement(
this, site.get_zone_params(), createparams.placement_rule);
@@ -3703,7 +3713,6 @@ void RGWCreateBucket::execute(optional_yield y)
if (!driver->is_meta_master()) {
// apply bucket creation on the master zone first
- bufferlist in_data;
JSONParser jp;
op_ret = rgw_forward_request_to_master(this, *s->penv.site, s->owner.id,
&in_data, &jp, s->info, y);
@@ -3780,7 +3789,10 @@ void RGWCreateBucket::execute(optional_yield y)
s->bucket->get_info().has_website = !s->bucket->get_info().website_conf.is_empty();
/* This will also set the quota on the bucket. */
- op_ret = s->bucket->merge_and_store_attrs(this, createparams.attrs, y);
+ s->bucket->set_attrs(std::move(createparams.attrs));
+ constexpr bool exclusive = false; // overwrite
+ constexpr ceph::real_time no_set_mtime{};
+ op_ret = s->bucket->put_info(this, exclusive, no_set_mtime, y);
} while (op_ret == -ECANCELED && tries++ < 20);
/* Restore the proper return code. */
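
Replacing merge_and_store_attrs() with set_attrs() plus put_info() keeps the surrounding retry loop meaningful: put_info() is a versioned write that fails with -ECANCELED when a racing writer updated the bucket info first, and the loop rereads and reapplies up to 20 times. The shape of that pattern, with hypothetical stub helpers:

#include <cerrno>

int reread_bucket_info() { return 0; }           // stub: refresh in-memory copy
int write_bucket_info_versioned() { return 0; }  // stub: -ECANCELED on version race

int store_with_retry() {
  int ret = 0;
  int tries = 0;
  do {
    if (int r = reread_bucket_info(); r < 0) return r;
    // ...reapply the attribute/quota changes to the fresh copy here...
    ret = write_bucket_info_versioned();
  } while (ret == -ECANCELED && tries++ < 20);
  return ret;
}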
@@ -4331,6 +4343,9 @@ void RGWPutObj::execute(optional_yield y)
}
return;
}
+
+ multipart_cksum_type = upload->cksum_type;
+
/* upload will go out of scope, so copy the dest placement for later use */
s->dest_placement = *pdest_placement;
pdest_placement = &s->dest_placement;
@@ -4461,11 +4476,12 @@ void RGWPutObj::execute(optional_yield y)
/* optional streaming checksum */
try {
cksum_filter =
- rgw::putobj::RGWPutObj_Cksum::Factory(filter, *s->info.env);
+ rgw::putobj::RGWPutObj_Cksum::Factory(filter, *s->info.env, multipart_cksum_type);
} catch (const rgw::io::Exception& e) {
op_ret = -e.code().value();
return;
}
+
if (cksum_filter) {
filter = &*cksum_filter;
}
@@ -4612,10 +4628,12 @@ void RGWPutObj::execute(optional_yield y)
if (cksum_filter) {
const auto& hdr = cksum_filter->header();
+ auto expected_ck = cksum_filter->expected(*s->info.env);
auto cksum_verify =
cksum_filter->verify(*s->info.env); // valid or no supplied cksum
cksum = get<1>(cksum_verify);
- if (std::get<0>(cksum_verify)) {
+ if ((!expected_ck) ||
+ std::get<0>(cksum_verify)) {
buffer::list cksum_bl;
ldpp_dout_fmt(this, 16,
@@ -4623,14 +4641,13 @@ void RGWPutObj::execute(optional_yield y)
"\n\tcomputed={} == \n\texpected={}",
hdr.second,
cksum->to_armor(),
- cksum_filter->expected(*s->info.env));
+ (!!expected_ck) ? expected_ck : "(checksum unavailable)");
cksum->encode(cksum_bl);
emplace_attr(RGW_ATTR_CKSUM, std::move(cksum_bl));
} else {
/* content checksum mismatch */
auto computed_ck = cksum->to_armor();
- auto expected_ck = cksum_filter->expected(*s->info.env);
ldpp_dout_fmt(this, 4,
"{} content checksum mismatch"
@@ -4833,7 +4850,8 @@ void RGWPostObj::execute(optional_yield y)
/* optional streaming checksum */
try {
cksum_filter =
- rgw::putobj::RGWPutObj_Cksum::Factory(filter, *s->info.env);
+ rgw::putobj::RGWPutObj_Cksum::Factory(
+ filter, *s->info.env, rgw::cksum::Type::none /* no override */);
} catch (const rgw::io::Exception& e) {
op_ret = -e.code().value();
return;
@@ -5181,7 +5199,10 @@ void RGWPutMetadataBucket::execute(optional_yield y)
/* Setting attributes also stores the provided bucket info. Due
* to this fact, the new quota settings can be serialized with
* the same call. */
- op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield);
+ s->bucket->set_attrs(attrs);
+ constexpr bool exclusive = false; // overwrite
+ constexpr ceph::real_time no_set_mtime{};
+ op_ret = s->bucket->put_info(this, exclusive, no_set_mtime, s->yield);
return op_ret;
}, y);
}
@@ -5969,8 +5990,6 @@ void RGWGetACLs::execute(optional_yield y)
acls = ss.str();
}
-
-
int RGWPutACLs::verify_permission(optional_yield y)
{
bool perm;
@@ -5992,6 +6011,74 @@ int RGWPutACLs::verify_permission(optional_yield y)
return 0;
}
+uint16_t RGWGetObjAttrs::recognize_attrs(const std::string& hdr, uint16_t deflt)
+{
+ auto attrs{deflt};
+ auto sa = ceph::split(hdr, ",");
+ for (auto& k : sa) {
+ if (boost::iequals(k, "etag")) {
+ attrs |= as_flag(ReqAttributes::Etag);
+ }
+ if (boost::iequals(k, "checksum")) {
+ attrs |= as_flag(ReqAttributes::Checksum);
+ }
+ if (boost::iequals(k, "objectparts")) {
+ attrs |= as_flag(ReqAttributes::ObjectParts);
+ }
+ if (boost::iequals(k, "objectsize")) {
+ attrs |= as_flag(ReqAttributes::ObjectSize);
+ }
+ if (boost::iequals(k, "storageclass")) {
+ attrs |= as_flag(ReqAttributes::StorageClass);
+ }
+ }
+ return attrs;
+} /* RGWGetObjAttrs::recognize_attrs */
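
recognize_attrs() folds the comma-separated x-amz-object-attributes header into a bitmask through as_flag(). A standalone equivalent showing the same case-insensitive fold; flag values and helper names are hypothetical:

#include <cstddef>
#include <cstdint>
#include <string_view>

constexpr uint16_t F_ETAG = 1 << 0, F_CKSUM = 1 << 1, F_PARTS = 1 << 2;

// ASCII-only case-insensitive compare, adequate for the keywords used here
bool iequal(std::string_view a, std::string_view b) {
  if (a.size() != b.size()) return false;
  for (std::size_t i = 0; i < a.size(); ++i)
    if ((a[i] | 0x20) != (b[i] | 0x20)) return false;
  return true;
}

uint16_t parse_attrs(std::string_view hdr) {
  uint16_t flags = 0;
  while (!hdr.empty()) {
    auto pos = hdr.find(',');
    std::string_view tok = hdr.substr(0, pos);
    if (iequal(tok, "etag"))        flags |= F_ETAG;
    if (iequal(tok, "checksum"))    flags |= F_CKSUM;
    if (iequal(tok, "objectparts")) flags |= F_PARTS;
    hdr = (pos == std::string_view::npos) ? std::string_view{}
                                          : hdr.substr(pos + 1);
  }
  return flags;
}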
+
+int RGWGetObjAttrs::verify_permission(optional_yield y)
+{
+ bool perm = false;
+ auto [has_s3_existing_tag, has_s3_resource_tag] =
+ rgw_check_policy_condition(this, s);
+
+ if (! rgw::sal::Object::empty(s->object.get())) {
+
+ auto iam_action1 = s->object->get_instance().empty() ?
+ rgw::IAM::s3GetObject :
+ rgw::IAM::s3GetObjectVersion;
+
+ auto iam_action2 = s->object->get_instance().empty() ?
+ rgw::IAM::s3GetObjectAttributes :
+ rgw::IAM::s3GetObjectVersionAttributes;
+
+ if (has_s3_existing_tag || has_s3_resource_tag) {
+ rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+ }
+
+  /* XXXX the following conjunction should be &&, but iam_action2 is currently
+   * not hooked up and always fails (it should succeed if the requestor has
+   * READ access to the object) */
+ perm = (verify_object_permission(this, s, iam_action1) || /* && */
+ verify_object_permission(this, s, iam_action2));
+ }
+
+ if (! perm) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+void RGWGetObjAttrs::pre_exec()
+{
+ rgw_bucket_object_pre_exec(s);
+}
+
+void RGWGetObjAttrs::execute(optional_yield y)
+{
+ RGWGetObj::execute(y);
+} /* RGWGetObjAttrs::execute */
+
int RGWGetLC::verify_permission(optional_yield y)
{
auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
@@ -6659,6 +6746,14 @@ try_sum_part_cksums(const DoutPrefixProvider *dpp,
++parts_ix;
auto& part_cksum = part.second->get_cksum();
+ if (! part_cksum) {
+ ldpp_dout_fmt(dpp, 0,
+ "ERROR: multipart part checksum not present (ix=={})",
+ parts_ix);
+ op_ret = -ERR_INVALID_REQUEST;
+ return op_ret;
+ }
+
ldpp_dout_fmt(dpp, 16,
"INFO: {} iterate part: {} {} {}",
__func__, parts_ix, part_cksum->type_string(),
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index 9f747501729..dcf64c31572 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -12,6 +12,7 @@
#pragma once
+#include <cstdint>
#include <limits.h>
#include <array>
@@ -1111,6 +1112,7 @@ class RGWCreateBucket : public RGWOp {
bool relaxed_region_enforcement = false;
RGWCORSConfiguration cors_config;
std::set<std::string> rmattr_names;
+ bufferlist in_data;
virtual bool need_metadata_upload() const { return false; }
@@ -1237,6 +1239,7 @@ protected:
std::string multipart_upload_id;
std::string multipart_part_str;
int multipart_part_num = 0;
+ rgw::cksum::Type multipart_cksum_type{rgw::cksum::Type::none};
jspan_ptr multipart_trace;
boost::optional<ceph::real_time> delete_at;
@@ -1644,6 +1647,50 @@ public:
uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+class RGWGetObjAttrs : public RGWGetObj {
+protected:
+ std::string version_id;
+ std::string expected_bucket_owner;
+ std::optional<int> marker;
+ std::optional<int> max_parts;
+ uint16_t requested_attributes{0};
+#if 0
+ /* used to decrypt attributes for objects stored with SSE-C */
+ x-amz-server-side-encryption-customer-algorithm
+ x-amz-server-side-encryption-customer-key
+ x-amz-server-side-encryption-customer-key-MD5
+#endif
+public:
+
+ enum class ReqAttributes : uint16_t {
+ None = 0,
+ Etag,
+ Checksum,
+ ObjectParts,
+ StorageClass,
+ ObjectSize
+ };
+
+ static uint16_t as_flag(ReqAttributes attr) {
+ return 1 << (uint16_t(attr) ? uint16_t(attr) - 1 : 0);
+ }
+
+ static uint16_t recognize_attrs(const std::string& hdr, uint16_t deflt = 0);
+
+ RGWGetObjAttrs() : RGWGetObj()
+ {
+    RGWGetObj::get_data = false; // attribute requests do not return object data
+ }
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+ void send_response() override = 0;
+ const char* name() const override { return "get_obj_attrs"; }
+ RGWOpType get_type() override { return RGW_OP_GET_OBJ_ATTRS; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+}; /* RGWGetObjAttrs */
+
class RGWGetLC : public RGWOp {
protected:
diff --git a/src/rgw/rgw_op_type.h b/src/rgw/rgw_op_type.h
index 49faea6403d..2c8225d289e 100644
--- a/src/rgw/rgw_op_type.h
+++ b/src/rgw/rgw_op_type.h
@@ -30,6 +30,7 @@ enum RGWOpType {
RGW_OP_COPY_OBJ,
RGW_OP_GET_ACLS,
RGW_OP_PUT_ACLS,
+ RGW_OP_GET_OBJ_ATTRS,
RGW_OP_GET_CORS,
RGW_OP_PUT_CORS,
RGW_OP_DELETE_CORS,
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index aa33080af56..9111696453e 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -403,6 +403,17 @@ public:
virtual std::string canonical_name() const override { return fmt::format("REST.{}.ACL", s->info.method); }
};
+class RGWGetObjAttrs_ObjStore : public RGWGetObjAttrs {
+public:
+ RGWGetObjAttrs_ObjStore() {}
+ ~RGWGetObjAttrs_ObjStore() override {}
+
+ int get_params(optional_yield y) = 0;
+ /* not actually used */
+  int send_response_data_error(optional_yield y) override { return 0; }
+  int send_response_data(bufferlist& bl, off_t ofs, off_t len) override { return 0; }
+};
+
class RGWGetLC_ObjStore : public RGWGetLC {
public:
RGWGetLC_ObjStore() {}
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index 30ebe8e8965..9edb79d8fd0 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -9,6 +9,7 @@
#include <string_view>
#include "common/ceph_crypto.h"
+#include "common/dout.h"
#include "common/split.h"
#include "common/Formatter.h"
#include "common/utf8.h"
@@ -807,7 +808,6 @@ void RGWGetObjTags_ObjStore_S3::send_response_data(bufferlist& bl)
}
}
-
int RGWPutObjTags_ObjStore_S3::get_params(optional_yield y)
{
RGWXMLParser parser;
@@ -2533,6 +2533,10 @@ int RGWCreateBucket_ObjStore_S3::get_params(optional_yield y)
if ((op_ret < 0) && (op_ret != -ERR_LENGTH_REQUIRED))
return op_ret;
+ if (!driver->is_meta_master()) {
+ in_data.append(data);
+ }
+
if (data.length()) {
RGWCreateBucketParser parser;
@@ -3815,6 +3819,196 @@ void RGWPutACLs_ObjStore_S3::send_response()
dump_start(s);
}
+int RGWGetObjAttrs_ObjStore_S3::get_params(optional_yield y)
+{
+ string err;
+ auto& env = s->info.env;
+ version_id = s->info.args.get("versionId");
+
+ auto hdr = env->get_optional("HTTP_X_AMZ_EXPECTED_BUCKET_OWNER");
+ if (hdr) {
+ expected_bucket_owner = *hdr;
+ }
+
+ hdr = env->get_optional("HTTP_X_AMZ_MAX_PARTS");
+ if (hdr) {
+ max_parts = strict_strtol(hdr->c_str(), 10, &err);
+ if (!err.empty()) {
+ s->err.message = "Invalid value for MaxParts: " + err;
+ ldpp_dout(s, 10) << "Invalid value for MaxParts " << *hdr << ": "
+ << err << dendl;
+ return -ERR_INVALID_PART;
+ }
+ max_parts = std::min(*max_parts, 1000);
+ }
+
+ hdr = env->get_optional("HTTP_X_AMZ_PART_NUMBER_MARKER");
+ if (hdr) {
+ marker = strict_strtol(hdr->c_str(), 10, &err);
+ if (!err.empty()) {
+ s->err.message = "Invalid value for PartNumberMarker: " + err;
+ ldpp_dout(s, 10) << "Invalid value for PartNumberMarker " << *hdr << ": "
+ << err << dendl;
+ return -ERR_INVALID_PART;
+ }
+ }
+
+ hdr = env->get_optional("HTTP_X_AMZ_OBJECT_ATTRIBUTES");
+ if (hdr) {
+ requested_attributes = recognize_attrs(*hdr);
+ }
+
+ /* XXX skipping SSE-C params for now */
+
+ return 0;
+} /* RGWGetObjAttrs_ObjStore_S3::get_params(...) */
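
get_params() parses the optional integer headers with strict_strtol() and clamps MaxParts to the S3 ceiling of 1000. An equivalent strict parse-and-clamp as a sketch using std::from_chars (the real code uses Ceph's strict_strtol helper):

#include <algorithm>
#include <charconv>
#include <optional>
#include <string_view>

std::optional<int> parse_max_parts(std::string_view hdr) {
  int v = 0;
  auto [p, ec] = std::from_chars(hdr.data(), hdr.data() + hdr.size(), v);
  if (ec != std::errc{} || p != hdr.data() + hdr.size())
    return std::nullopt;      // reject empty input and trailing junk
  return std::min(v, 1000);   // S3 caps MaxParts at 1000
}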
+
+int RGWGetObjAttrs_ObjStore_S3::get_decrypt_filter(
+ std::unique_ptr<RGWGetObj_Filter> *filter,
+ RGWGetObj_Filter* cb, bufferlist* manifest_bl)
+{
+ // we aren't actually decrypting the data, but for objects encrypted with
+ // SSE-C we do need to verify that required headers are present and valid
+ //
+ // in the SSE-KMS and SSE-S3 cases, this unfortunately causes us to fetch
+ // decryption keys which we don't need :(
+ std::unique_ptr<BlockCrypt> block_crypt; // ignored
+ std::map<std::string, std::string> crypt_http_responses; // ignored
+ return rgw_s3_prepare_decrypt(s, s->yield, attrs, &block_crypt,
+ crypt_http_responses);
+}
+
+void RGWGetObjAttrs_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+
+ if (op_ret == 0) {
+ version_id = s->object->get_instance();
+
+ // x-amz-delete-marker: DeleteMarker // not sure we can plausibly do this?
+ dump_last_modified(s, lastmod);
+ dump_header_if_nonempty(s, "x-amz-version-id", version_id);
+ // x-amz-request-charged: RequestCharged
+ }
+
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+
+ if (op_ret == 0) {
+ s->formatter->open_object_section("GetObjectAttributes");
+ if (requested_attributes & as_flag(ReqAttributes::Etag)) {
+ if (lo_etag.empty()) {
+ auto iter = attrs.find(RGW_ATTR_ETAG);
+ if (iter != attrs.end()) {
+ lo_etag = iter->second.to_str();
+ }
+ }
+ s->formatter->dump_string("ETag", lo_etag);
+ }
+
+ if (requested_attributes & as_flag(ReqAttributes::Checksum)) {
+ s->formatter->open_object_section("Checksum");
+ auto iter = attrs.find(RGW_ATTR_CKSUM);
+ if (iter != attrs.end()) {
+ try {
+ rgw::cksum::Cksum cksum;
+ auto bliter = iter->second.cbegin();
+ cksum.decode(bliter);
+ if (multipart_parts_count && multipart_parts_count > 0) {
+ s->formatter->dump_string(cksum.element_name(),
+ fmt::format("{}-{}", cksum.to_armor(), *multipart_parts_count));
+ } else {
+ s->formatter->dump_string(cksum.element_name(), cksum.to_armor());
+ }
+ } catch (buffer::error& err) {
+ ldpp_dout(this, 0)
+ << "ERROR: could not decode stored cksum, caught buffer::error" << dendl;
+ }
+ }
+ s->formatter->close_section(); /* Checksum */
+ } /* Checksum */
+
+ if (requested_attributes & as_flag(ReqAttributes::ObjectParts)) {
+ if (multipart_parts_count && multipart_parts_count > 0) {
+
+ /* XXX the following was needed to see a manifest at list_parts()! */
+ op_ret = s->object->load_obj_state(s, s->yield);
+ if (op_ret < 0) {
+ ldpp_dout_fmt(this, 0,
+ "ERROR: {} load_obj_state() failed ret={}", __func__,
+ op_ret);
+ }
+
+ ldpp_dout_fmt(this, 16,
+ "{} attr flags={} parts_count={}",
+ __func__, requested_attributes, *multipart_parts_count);
+
+ s->formatter->open_object_section("ObjectParts");
+
+ bool truncated = false;
+ int next_marker;
+
+ using namespace rgw::sal;
+
+ int ret =
+ s->object->list_parts(
+ this, s->cct,
+ max_parts ? *max_parts : 1000,
+ marker ? *marker : 0,
+ &next_marker, &truncated,
+ [&](const Object::Part& part) -> int {
+ s->formatter->open_object_section("Part");
+ s->formatter->dump_int("PartNumber", part.part_number);
+ s->formatter->dump_unsigned("Size", part.part_size);
+ if (part.cksum.type != rgw::cksum::Type::none) {
+ s->formatter->dump_string(part.cksum.element_name(), part.cksum.to_armor());
+ }
+ s->formatter->close_section(); /* Part */
+ return 0;
+ }, s->yield);
+
+ if (ret < 0) {
+ ldpp_dout_fmt(this, 0,
+ "ERROR: {} list-parts failed for {}",
+ __func__, s->object->get_name());
+ }
+ /* AWS docs disagree on the name of this element */
+ s->formatter->dump_int("PartsCount", *multipart_parts_count);
+ s->formatter->dump_int("TotalPartsCount", *multipart_parts_count);
+ s->formatter->dump_bool("IsTruncated", truncated);
+ if (max_parts) {
+ s->formatter->dump_int("MaxParts", *max_parts);
+ }
+      if (truncated) {
+ s->formatter->dump_int("NextPartNumberMarker", next_marker);
+ }
+ if (marker) {
+ s->formatter->dump_int("PartNumberMarker", *marker);
+ }
+ s->formatter->close_section();
+ } /* multipart_parts_count positive */
+ } /* ObjectParts */
+
+ if (requested_attributes & as_flag(ReqAttributes::ObjectSize)) {
+ s->formatter->dump_int("ObjectSize", s->obj_size);
+ }
+
+ if (requested_attributes & as_flag(ReqAttributes::StorageClass)) {
+ auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != attrs.end()) {
+ s->formatter->dump_string("StorageClass", iter->second.to_str());
+ } else {
+ s->formatter->dump_string("StorageClass", "STANDARD");
+ }
+ }
+ s->formatter->close_section();
+ } /* op_ret == 0 */
+
+ rgw_flush_formatter_and_reset(s, s->formatter);
+} /* RGWGetObjAttrs_ObjStore_S3::send_response */
+
void RGWGetLC_ObjStore_S3::execute(optional_yield y)
{
config.set_ctx(s->cct);
@@ -4794,6 +4988,7 @@ RGWOp *RGWHandler_REST_Bucket_S3::get_obj_op(bool get_data) const
RGWOp *RGWHandler_REST_Bucket_S3::op_get()
{
+ /* XXX maybe we could replace this with an indexing operation */
if (s->info.args.sub_resource_exists("encryption"))
return nullptr;
@@ -4990,6 +5185,8 @@ RGWOp *RGWHandler_REST_Obj_S3::op_get()
return new RGWGetObjLayout_ObjStore_S3;
} else if (is_tagging_op()) {
return new RGWGetObjTags_ObjStore_S3;
+ } else if (is_attributes_op()) {
+ return new RGWGetObjAttrs_ObjStore_S3;
} else if (is_obj_retention_op()) {
return new RGWGetObjRetention_ObjStore_S3;
} else if (is_obj_legal_hold_op()) {
diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h
index 50160d79a42..e8fdc69751c 100644
--- a/src/rgw/rgw_rest_s3.h
+++ b/src/rgw/rgw_rest_s3.h
@@ -374,6 +374,18 @@ public:
int get_params(optional_yield y) override;
};
+class RGWGetObjAttrs_ObjStore_S3 : public RGWGetObjAttrs_ObjStore {
+public:
+ RGWGetObjAttrs_ObjStore_S3() {}
+ ~RGWGetObjAttrs_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ int get_decrypt_filter(std::unique_ptr<RGWGetObj_Filter>* filter,
+ RGWGetObj_Filter* cb,
+ bufferlist* manifest_bl) override;
+ void send_response() override;
+};
+
class RGWGetLC_ObjStore_S3 : public RGWGetLC_ObjStore {
protected:
RGWLifecycleConfiguration_S3 config;
@@ -701,6 +713,9 @@ protected:
bool is_acl_op() const {
return s->info.args.exists("acl");
}
+ bool is_attributes_op() const {
+ return s->info.args.exists("attributes");
+ }
bool is_cors_op() const {
return s->info.args.exists("cors");
}
@@ -759,6 +774,9 @@ protected:
bool is_acl_op() const {
return s->info.args.exists("acl");
}
+ bool is_attributes_op() const {
+ return s->info.args.exists("attributes");
+ }
bool is_tagging_op() const {
return s->info.args.exists("tagging");
}
diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h
index e098c4decf7..4b94f74b851 100644
--- a/src/rgw/rgw_sal.h
+++ b/src/rgw/rgw_sal.h
@@ -15,6 +15,7 @@
#pragma once
+#include <cstdint>
#include <optional>
#include <boost/intrusive_ptr.hpp>
#include <boost/smart_ptr/intrusive_ref_counter.hpp>
@@ -26,6 +27,7 @@
#include "rgw_notify_event_type.h"
#include "rgw_req_context.h"
#include "include/random.h"
+#include "include/function2.hpp"
// FIXME: following subclass dependencies
#include "driver/rados/rgw_user.h"
@@ -1169,6 +1171,9 @@ class Object {
std::string* version_id, std::string* tag, std::string* etag,
void (*progress_cb)(off_t, void *), void* progress_data,
const DoutPrefixProvider* dpp, optional_yield y) = 0;
+
+ /** return logging subsystem */
+  virtual unsigned get_subsys() { return ceph_subsys_rgw; }
/** Get the ACL for this object */
virtual RGWAccessControlPolicy& get_acl(void) = 0;
/** Set the ACL for this object */
@@ -1249,6 +1254,28 @@ class Object {
/** Dump driver-specific object layout info in JSON */
virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) = 0;
+ /* A transfer data type describing metadata specific to one part of a
+ * completed multipart upload object, following the GetObjectAttributes
+ * response syntax for Object::Parts here:
+ * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObjectAttributes.html */
+ class Part
+ {
+ public:
+ int part_number;
+ uint32_t part_size;
+ rgw::cksum::Cksum cksum;
+ }; /* Part */
+
+ /* callback function/object used by list_parts */
+ using list_parts_each_t =
+ const fu2::unique_function<int(const Part&) const>;
+
+  /** If multipart, enumerate parts of the object in the range [marker, marker+min(max_parts, parts_count-1)] */
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int max_parts, int marker, int* next_marker,
+ bool* truncated, list_parts_each_t each_func,
+ optional_yield y) = 0;
+
/** Get the cached attributes for this object */
virtual Attrs& get_attrs(void) = 0;
/** Get the (const) cached attributes for this object */
@@ -1447,7 +1474,7 @@ public:
virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) = 0;
/** List all the parts of this upload, filling the parts cache */
virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
- int num_parts, int marker,
+ int max_parts, int marker,
int* next_marker, bool* truncated, optional_yield y,
bool assume_unsorted = false) = 0;
/** Abort this upload */
@@ -1751,8 +1778,6 @@ class Zone {
virtual bool is_writeable() = 0;
/** Get the URL for the endpoint for redirecting to this zone */
virtual bool get_redirect_endpoint(std::string* endpoint) = 0;
- /** Check to see if the given API is supported in this zone */
- virtual bool has_zonegroup_api(const std::string& api) const = 0;
/** Get the current period ID for this zone */
virtual const std::string& get_current_period_id() = 0;
/** Get thes system access key for this zone */
diff --git a/src/rgw/rgw_sal_dbstore.cc b/src/rgw/rgw_sal_dbstore.cc
index 0e4f95846d1..02fd7a49cda 100644
--- a/src/rgw/rgw_sal_dbstore.cc
+++ b/src/rgw/rgw_sal_dbstore.cc
@@ -458,14 +458,6 @@ namespace rgw::sal {
return false;
}
- bool DBZone::has_zonegroup_api(const std::string& api) const
- {
- if (api == "default")
- return true;
-
- return false;
- }
-
const std::string& DBZone::get_current_period_id()
{
return current_period->get_id();
@@ -496,6 +488,14 @@ namespace rgw::sal {
return std::make_unique<DBLuaManager>(this);
}
+ int DBObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int max_parts, int marker, int* next_marker,
+ bool* truncated, list_parts_each_t each_func,
+ optional_yield y)
+ {
+ return -EOPNOTSUPP;
+ }
+
int DBObject::load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh)
{
RGWObjState* astate;
diff --git a/src/rgw/rgw_sal_dbstore.h b/src/rgw/rgw_sal_dbstore.h
index b54249df031..4df10d1dce1 100644
--- a/src/rgw/rgw_sal_dbstore.h
+++ b/src/rgw/rgw_sal_dbstore.h
@@ -303,7 +303,6 @@ protected:
virtual const std::string& get_name() const override;
virtual bool is_writeable() override;
virtual bool get_redirect_endpoint(std::string* endpoint) override;
- virtual bool has_zonegroup_api(const std::string& api) const override;
virtual const std::string& get_current_period_id() override;
virtual const RGWAccessKey& get_system_key() override;
virtual const std::string& get_realm_name() override;
@@ -529,6 +528,7 @@ protected:
DBObject(DBObject& _o) = default;
+  virtual unsigned get_subsys() override { return ceph_subsys_rgw_dbstore; }
virtual int delete_object(const DoutPrefixProvider* dpp,
optional_yield y,
uint32_t flags,
@@ -554,6 +554,13 @@ protected:
virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; }
virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y, uint32_t flags) override;
+
+  /** If multipart, enumerate parts of the object in the range [marker, marker+min(max_parts, parts_count-1)] */
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int max_parts, int marker, int* next_marker,
+ bool* truncated, list_parts_each_t each_func,
+ optional_yield y) override;
+
virtual int load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh = true) override;
virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override;
virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override;
diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc
index 733bfa39ee2..15da580988e 100644
--- a/src/rgw/rgw_sal_filter.cc
+++ b/src/rgw/rgw_sal_filter.cc
@@ -1046,6 +1046,17 @@ RGWAccessControlPolicy& FilterObject::get_acl()
return next->get_acl();
}
+int FilterObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int max_parts, int marker, int* next_marker,
+ bool* truncated, list_parts_each_t each_func,
+ optional_yield y)
+{
+ return next->list_parts(dpp, cct, max_parts, marker, next_marker,
+ truncated,
+ sal::Object::list_parts_each_t(each_func),
+ y);
+}
+
int FilterObject::load_obj_state(const DoutPrefixProvider *dpp,
optional_yield y, bool follow_olh) {
return next->load_obj_state(dpp, y, follow_olh);
diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h
index 43a440e8b10..947ce9d4bf5 100644
--- a/src/rgw/rgw_sal_filter.h
+++ b/src/rgw/rgw_sal_filter.h
@@ -108,9 +108,6 @@ public:
virtual bool get_redirect_endpoint(std::string* endpoint) override {
return next->get_redirect_endpoint(endpoint);
}
- virtual bool has_zonegroup_api(const std::string& api) const override {
- return next->has_zonegroup_api(api);
- }
virtual const std::string& get_current_period_id() override {
return next->get_current_period_id();
}
@@ -781,6 +778,12 @@ public:
virtual bool empty() const override { return next->empty(); }
virtual const std::string &get_name() const override { return next->get_name(); }
+  /** If multipart, enumerate parts of the object in the range [marker, marker+min(max_parts, parts_count-1)] */
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int max_parts, int marker, int* next_marker,
+ bool* truncated, list_parts_each_t each_func,
+ optional_yield y) override;
+
virtual int load_obj_state(const DoutPrefixProvider *dpp, optional_yield y,
bool follow_olh = true) override;
virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs,
diff --git a/src/rgw/services/svc_zone.cc b/src/rgw/services/svc_zone.cc
index 61360135dd9..97d81550058 100644
--- a/src/rgw/services/svc_zone.cc
+++ b/src/rgw/services/svc_zone.cc
@@ -657,18 +657,6 @@ const string& RGWSI_Zone::get_current_period_id() const
return current_period->get_id();
}
-bool RGWSI_Zone::has_zonegroup_api(const std::string& api) const
-{
- if (!current_period->get_id().empty()) {
- const auto& zonegroups_by_api = current_period->get_map().zonegroups_by_api;
- if (zonegroups_by_api.find(api) != zonegroups_by_api.end())
- return true;
- } else if (zonegroup->api_name == api) {
- return true;
- }
- return false;
-}
-
bool RGWSI_Zone::zone_is_writeable()
{
return writeable_zone && !get_zone().is_read_only();
diff --git a/src/rgw/services/svc_zone.h b/src/rgw/services/svc_zone.h
index 1b3f49587a3..719546eb8db 100644
--- a/src/rgw/services/svc_zone.h
+++ b/src/rgw/services/svc_zone.h
@@ -96,7 +96,6 @@ public:
uint32_t get_zone_short_id() const;
const std::string& get_current_period_id() const;
- bool has_zonegroup_api(const std::string& api) const;
bool zone_is_writeable();
bool zone_syncs_from(const RGWZone& target_zone, const RGWZone& source_zone) const;
diff --git a/src/script/run-make.sh b/src/script/run-make.sh
index 52d43d3a171..23724028fe6 100755
--- a/src/script/run-make.sh
+++ b/src/script/run-make.sh
@@ -29,6 +29,7 @@ function clean_up_after_myself() {
function detect_ceph_dev_pkgs() {
local boost_root=/opt/ceph
+ local cmake_opts=""
if test -f $boost_root/include/boost/config.hpp; then
cmake_opts+=" -DWITH_SYSTEM_BOOST=ON -DBOOST_ROOT=$boost_root"
else
diff --git a/src/test/ObjectMap/KeyValueDBMemory.cc b/src/test/ObjectMap/KeyValueDBMemory.cc
index 234e963397e..cfe25930d6a 100644
--- a/src/test/ObjectMap/KeyValueDBMemory.cc
+++ b/src/test/ObjectMap/KeyValueDBMemory.cc
@@ -132,12 +132,26 @@ public:
return "";
}
+ string_view key_as_sv() override {
+ if (valid())
+ return (*it).first.second;
+ else
+ return "";
+ }
+
pair<string,string> raw_key() override {
if (valid())
return (*it).first;
else
return make_pair("", "");
}
+
+ pair<string_view,string_view> raw_key_as_sv() override {
+ if (valid())
+ return (*it).first;
+ else
+ return make_pair("", "");
+ }
bool raw_key_is_prefixed(const string &prefix) override {
return prefix == (*it).first.first;
@@ -150,6 +164,13 @@ public:
return bufferlist();
}
+ std::string_view value_as_sv() override {
+ if (valid())
+ return std::string_view{it->second.c_str(), it->second.length()};
+ else
+ return std::string_view();
+ }
+
int status() override {
return 0;
}
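
The new *_as_sv() accessors mirror the existing string and pair accessors but return views into storage the iterator owns, saving a copy per visited key/value. The lifetime rule they imply, in a sketch:

#include <string>
#include <string_view>

struct RowSketch { std::string key, value; };

// no copy: the view aliases the row's storage and must not outlive it
std::string_view value_as_sv(const RowSketch& r) {
  return std::string_view{r.value.data(), r.value.size()};
}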
diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc
index 8b1f7435c87..7874411e0ff 100644
--- a/src/test/crimson/seastore/test_btree_lba_manager.cc
+++ b/src/test/crimson/seastore/test_btree_lba_manager.cc
@@ -157,7 +157,10 @@ struct btree_test_base :
}).safe_then([this] {
return seastar::do_with(
cache->create_transaction(
- Transaction::src_t::MUTATE, "test_set_up_fut", false),
+ Transaction::src_t::MUTATE,
+ "test_set_up_fut",
+ CACHE_HINT_TOUCH,
+ false),
[this](auto &ref_t) {
return with_trans_intr(*ref_t, [&](auto &t) {
cache->init();
@@ -236,7 +239,10 @@ struct lba_btree_test : btree_test_base {
template <typename F>
auto lba_btree_update(F &&f) {
auto tref = cache->create_transaction(
- Transaction::src_t::MUTATE, "test_btree_update", false);
+ Transaction::src_t::MUTATE,
+ "test_btree_update",
+ CACHE_HINT_TOUCH,
+ false);
auto &t = *tref;
with_trans_intr(
t,
@@ -281,7 +287,10 @@ struct lba_btree_test : btree_test_base {
template <typename F>
auto lba_btree_read(F &&f) {
auto t = cache->create_transaction(
- Transaction::src_t::READ, "test_btree_read", false);
+ Transaction::src_t::READ,
+ "test_btree_read",
+ CACHE_HINT_TOUCH,
+ false);
return with_trans_intr(
*t,
[this, f=std::forward<F>(f)](auto &t) mutable {
@@ -429,7 +438,10 @@ struct btree_lba_manager_test : btree_test_base {
auto create_transaction(bool create_fake_extent=true) {
auto t = test_transaction_t{
cache->create_transaction(
- Transaction::src_t::MUTATE, "test_mutate_lba", false),
+ Transaction::src_t::MUTATE,
+ "test_mutate_lba",
+ CACHE_HINT_TOUCH,
+ false),
test_lba_mappings
};
if (create_fake_extent) {
@@ -445,7 +457,10 @@ struct btree_lba_manager_test : btree_test_base {
auto create_weak_transaction() {
auto t = test_transaction_t{
cache->create_transaction(
- Transaction::src_t::READ, "test_read_weak", true),
+ Transaction::src_t::READ,
+ "test_read_weak",
+ CACHE_HINT_TOUCH,
+ true),
test_lba_mappings
};
return t;
diff --git a/src/test/crimson/seastore/test_seastore_cache.cc b/src/test/crimson/seastore/test_seastore_cache.cc
index 6e24f436b98..fa774886139 100644
--- a/src/test/crimson/seastore/test_seastore_cache.cc
+++ b/src/test/crimson/seastore/test_seastore_cache.cc
@@ -87,7 +87,10 @@ struct cache_test_t : public seastar_test_suite_t {
auto get_transaction() {
return cache->create_transaction(
- Transaction::src_t::MUTATE, "test_cache", false);
+ Transaction::src_t::MUTATE,
+ "test_cache",
+ CACHE_HINT_TOUCH,
+ false);
}
template <typename T, typename... Args>
diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc
index 7e058c80ed6..e0fc5821d08 100644
--- a/src/test/crimson/test_backfill.cc
+++ b/src/test/crimson/test_backfill.cc
@@ -119,6 +119,11 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener {
events_to_dispatch.emplace_back(event.intrusive_from_this());
}
+ template <class EventT>
+ void schedule_event_immediate(const EventT& event) {
+ events_to_dispatch.emplace_front(event.intrusive_from_this());
+ }
+
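
schedule_event_immediate() exists so cancellation can preempt whatever is already queued: CancelBackfill and Triggered now jump to the front of the dispatch deque instead of appending behind in-flight events. The queue discipline in isolation:

#include <deque>
#include <string>

struct DispatcherSketch {
  std::deque<std::string> events;
  void schedule(std::string e) { events.push_back(std::move(e)); }            // normal FIFO
  void schedule_immediate(std::string e) { events.push_front(std::move(e)); } // preempts
};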
// BackfillListener {
void request_replica_scan(
const pg_shard_t& target,
@@ -188,12 +193,11 @@ public:
struct PGFacade;
void cancel() {
- events_to_dispatch.clear();
- schedule_event(crimson::osd::BackfillState::CancelBackfill{});
+ schedule_event_immediate(crimson::osd::BackfillState::CancelBackfill{});
}
void resume() {
- schedule_event(crimson::osd::BackfillState::Triggered{});
+ schedule_event_immediate(crimson::osd::BackfillState::Triggered{});
}
};
@@ -274,6 +278,9 @@ struct BackfillFixture::PGFacade : public crimson::osd::BackfillState::PGFacade
return backfill_source.projected_log;
}
+ std::ostream &print(std::ostream &out) const override {
+ return out << "FakePGFacade";
+ }
};
BackfillFixture::BackfillFixture(
@@ -452,7 +459,69 @@ TEST(backfill, two_empty_replicas)
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
}
-TEST(backfill, cancel_resume)
+TEST(backfill, cancel_resume_middle_of_primaryscan)
+{
+ const auto reference_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
+ { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} },
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} },
+ }};
+ auto cluster_fixture = BackfillFixtureBuilder::add_source(
+ reference_store.objs
+ ).add_target(
+ { /* nothing 1 */ }
+ ).add_target(
+ { /* nothing 2 */ }
+ ).get_result();
+
+ EXPECT_CALL(cluster_fixture, backfilled);
+ cluster_fixture.cancel();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
+ cluster_fixture.resume();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_till_done();
+
+ EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
+TEST(backfill, cancel_resume_middle_of_replicascan1)
+{
+ const auto reference_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
+ { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} },
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} },
+ }};
+ auto cluster_fixture = BackfillFixtureBuilder::add_source(
+ reference_store.objs
+ ).add_target(
+ { /* nothing 1 */ }
+ ).add_target(
+ { /* nothing 2 */ }
+ ).get_result();
+
+ EXPECT_CALL(cluster_fixture, backfilled);
+ cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
+ cluster_fixture.cancel();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
+ cluster_fixture.resume();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_till_done();
+
+ EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
+TEST(backfill, cancel_resume_middle_of_replicascan2)
{
const auto reference_store = FakeStore{ {
{ "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
@@ -469,12 +538,43 @@ TEST(backfill, cancel_resume)
EXPECT_CALL(cluster_fixture, backfilled);
cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
cluster_fixture.cancel();
cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
cluster_fixture.resume();
cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_till_done();
+
+ EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
+TEST(backfill, cancel_resume_middle_of_push1)
+{
+ const auto reference_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
+ { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} },
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} },
+ }};
+ auto cluster_fixture = BackfillFixtureBuilder::add_source(
+ reference_store.objs
+ ).add_target(
+ { /* nothing 1 */ }
+ ).add_target(
+ { /* nothing 2 */ }
+ ).get_result();
+
+ EXPECT_CALL(cluster_fixture, backfilled);
+ cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
+ cluster_fixture.cancel();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
+ cluster_fixture.resume();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
@@ -483,7 +583,7 @@ TEST(backfill, cancel_resume)
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
}
-TEST(backfill, cancel_resume_middle_of_scan)
+TEST(backfill, cancel_resume_middle_of_push2)
{
const auto reference_store = FakeStore{ {
{ "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
@@ -501,14 +601,46 @@ TEST(backfill, cancel_resume_middle_of_scan)
EXPECT_CALL(cluster_fixture, backfilled);
cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
cluster_fixture.cancel();
cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
cluster_fixture.resume();
cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.next_till_done();
+
+ EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
+}
+
+TEST(backfill, cancel_resume_middle_of_push3)
+{
+ const auto reference_store = FakeStore{ {
+ { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} },
+ { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} },
+ { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} },
+ }};
+ auto cluster_fixture = BackfillFixtureBuilder::add_source(
+ reference_store.objs
+ ).add_target(
+ { /* nothing 1 */ }
+ ).add_target(
+ { /* nothing 2 */ }
+ ).get_result();
+
+ EXPECT_CALL(cluster_fixture, backfilled);
+ cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>();
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.cancel();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>();
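+  // pushes that were already in flight when the cancel arrived still
+  // complete while backfill is suspended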
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>();
+ cluster_fixture.resume();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>();
+ cluster_fixture.next_round2<crimson::osd::BackfillState::RequestDone>();
cluster_fixture.next_till_done();
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store));
diff --git a/src/test/librados/aio.cc b/src/test/librados/aio.cc
index 68587fe87d1..7fb90bdd38e 100644
--- a/src/test/librados/aio.cc
+++ b/src/test/librados/aio.cc
@@ -1722,3 +1722,59 @@ TEST(LibRadosAioEC, MultiWrite) {
rados_aio_release(my_completion2);
rados_aio_release(my_completion3);
}
+
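+// cancelling a completion that was never attached to an operation is expected
+// to succeed as a no-op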
+TEST(LibRadosAio, CancelBeforeSubmit) {
+ AioTestData test_data;
+ ASSERT_EQ("", test_data.init());
+
+ rados_completion_t completion;
+ ASSERT_EQ(0, rados_aio_create_completion2(nullptr, nullptr, &completion));
+
+ ASSERT_EQ(0, rados_aio_cancel(test_data.m_ioctx, completion));
+ rados_aio_release(completion);
+}
+
+TEST(LibRadosAio, CancelBeforeComplete) {
+ AioTestData test_data;
+ ASSERT_EQ("", test_data.init());
+
+ // cancellation tests are racy, so retry if completion beats the cancellation
+ int ret = 0;
+ int tries = 10;
+ do {
+ rados_completion_t completion;
+ ASSERT_EQ(0, rados_aio_create_completion2(nullptr, nullptr, &completion));
+ char buf[128];
+ ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent",
+ completion, buf, sizeof(buf), 0));
+
+ ASSERT_EQ(0, rados_aio_cancel(test_data.m_ioctx, completion));
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, rados_aio_wait_for_complete(completion));
+ }
+ ret = rados_aio_get_return_value(completion);
+ rados_aio_release(completion);
+ } while (ret == -ENOENT && --tries);
+
+ ASSERT_EQ(-ECANCELED, ret);
+}
+
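+// once the operation has already completed, cancel still returns success but
+// the original result (-ENOENT for the nonexistent object) is preserved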
+TEST(LibRadosAio, CancelAfterComplete) {
+ AioTestData test_data;
+ rados_completion_t completion;
+ ASSERT_EQ("", test_data.init());
+
+ ASSERT_EQ(0, rados_aio_create_completion2(nullptr, nullptr, &completion));
+ char buf[128];
+ ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent",
+ completion, buf, sizeof(buf), 0));
+
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, rados_aio_wait_for_complete(completion));
+ }
+ ASSERT_EQ(0, rados_aio_cancel(test_data.m_ioctx, completion));
+ ASSERT_EQ(-ENOENT, rados_aio_get_return_value(completion));
+ rados_aio_release(completion);
+}
diff --git a/src/test/librados/aio_cxx.cc b/src/test/librados/aio_cxx.cc
index a70af050d70..5e35869b5c2 100644
--- a/src/test/librados/aio_cxx.cc
+++ b/src/test/librados/aio_cxx.cc
@@ -2467,3 +2467,92 @@ TEST(LibRadosAio, MultiReads) {
ASSERT_EQ(0, memcmp(buf, bl.c_str(), sizeof(buf)));
}
}
+
+// cancellation test fixture for global setup/teardown
+// parameterized to test both IoCtx::aio_cancel() and AioCompletion::cancel()
+class Cancel : public ::testing::TestWithParam<bool> {
+ static constexpr auto pool_prefix = "ceph_test_rados_api_pp";
+ static Rados rados;
+ static std::string pool_name;
+ protected:
+ static IoCtx ioctx;
+ public:
+ static void SetUpTestCase() {
+ pool_name = get_temp_pool_name(pool_prefix);
+ ASSERT_EQ("", create_one_pool_pp(pool_name, rados));
+ ASSERT_EQ(0, rados.ioctx_create(pool_name.c_str(), ioctx));
+ }
+ static void TearDownTestCase() {
+ destroy_one_pool_pp(pool_name, rados);
+ }
+};
+Rados Cancel::rados;
+std::string Cancel::pool_name;
+IoCtx Cancel::ioctx;
+
+TEST_P(Cancel, BeforeSubmit)
+{
+ const bool use_completion = GetParam();
+
+ auto c = std::unique_ptr<AioCompletion>{Rados::aio_create_completion()};
+ if (use_completion) {
+ ASSERT_EQ(0, c->cancel());
+ } else {
+ ASSERT_EQ(0, ioctx.aio_cancel(c.get()));
+ }
+}
+
+TEST_P(Cancel, BeforeComplete)
+{
+ const bool use_completion = GetParam();
+
+ // cancellation tests are racy, so retry if completion beats the cancellation
+ int ret = 0;
+ int tries = 10;
+ do {
+ auto c = std::unique_ptr<AioCompletion>{Rados::aio_create_completion()};
+ ObjectReadOperation op;
+ op.assert_exists();
+ ioctx.aio_operate("nonexistent", c.get(), &op, nullptr);
+
+ if (use_completion) {
+ EXPECT_EQ(0, c->cancel());
+ } else {
+ EXPECT_EQ(0, ioctx.aio_cancel(c.get()));
+ }
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, c->wait_for_complete());
+ }
+ ret = c->get_return_value();
+ } while (ret == -ENOENT && --tries);
+
+ EXPECT_EQ(-ECANCELED, ret);
+}
+
+TEST_P(Cancel, AfterComplete)
+{
+ const bool use_completion = GetParam();
+
+ auto c = std::unique_ptr<AioCompletion>{Rados::aio_create_completion()};
+ ObjectReadOperation op;
+ op.assert_exists();
+ ioctx.aio_operate("nonexistent", c.get(), &op, nullptr);
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, c->wait_for_complete());
+ }
+ if (use_completion) {
+ EXPECT_EQ(0, c->cancel());
+ } else {
+ EXPECT_EQ(0, ioctx.aio_cancel(c.get()));
+ }
+ EXPECT_EQ(-ENOENT, c->get_return_value());
+}
+
+std::string cancel_test_name(const testing::TestParamInfo<Cancel::ParamType>& info)
+{
+ return info.param ? "cancel" : "aio_cancel";
+}
+
+INSTANTIATE_TEST_SUITE_P(LibRadosAio, Cancel, testing::Bool(), cancel_test_name);
diff --git a/src/test/librados/asio.cc b/src/test/librados/asio.cc
index 01ebb957150..500f36508a7 100644
--- a/src/test/librados/asio.cc
+++ b/src/test/librados/asio.cc
@@ -21,10 +21,14 @@
#include <boost/range/begin.hpp>
#include <boost/range/end.hpp>
+#include <boost/asio/bind_cancellation_slot.hpp>
+#include <boost/asio/cancellation_signal.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/spawn.hpp>
#include <boost/asio/use_future.hpp>
+#include <optional>
+
#define dout_subsys ceph_subsys_rados
#define dout_context g_ceph_context
@@ -78,6 +82,15 @@ void rethrow(std::exception_ptr eptr) {
if (eptr) std::rethrow_exception(eptr);
}
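+// helpers that capture a completion handler's error_code into an optional;
+// the second overload also binds a cancellation slot to the handler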
+auto capture(std::optional<error_code>& out) {
+ return [&out] (error_code ec, ...) { out = ec; };
+}
+
+auto capture(boost::asio::cancellation_signal& signal,
+ std::optional<error_code>& out) {
+ return boost::asio::bind_cancellation_slot(signal.slot(), capture(out));
+}
+
TEST_F(AsioRados, AsyncReadCallback)
{
boost::asio::io_context service;
@@ -385,6 +398,130 @@ TEST_F(AsioRados, AsyncWriteOperationYield)
service.run();
}
+// FIXME: this crashes on windows with:
+// Thread 1 received signal SIGILL, Illegal instruction.
+#ifndef _WIN32
+
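+// terminal cancellation aborts the operation outright; total cancellation is
+// only honored for reads (see the write/total test below, where it is a noop)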
+TEST_F(AsioRados, AsyncReadOperationCancelTerminal)
+{
+ // cancellation tests are racy, so retry if completion beats the cancellation
+ boost::system::error_code ec;
+ int tries = 10;
+ do {
+ boost::asio::io_context service;
+ boost::asio::cancellation_signal signal;
+ std::optional<error_code> result;
+
+ librados::ObjectReadOperation op;
+ op.assert_exists();
+ librados::async_operate(service, io, "noexist", &op, 0, nullptr,
+ capture(signal, result));
+
+ service.poll();
+ EXPECT_FALSE(service.stopped());
+ EXPECT_FALSE(result);
+
+ signal.emit(boost::asio::cancellation_type::terminal);
+
+ service.run();
+ ASSERT_TRUE(result);
+ ec = *result;
+
+ signal.emit(boost::asio::cancellation_type::all); // noop
+ } while (ec == std::errc::no_such_file_or_directory && --tries);
+
+ EXPECT_EQ(ec, boost::asio::error::operation_aborted);
+}
+
+TEST_F(AsioRados, AsyncReadOperationCancelTotal)
+{
+ // cancellation tests are racy, so retry if completion beats the cancellation
+ boost::system::error_code ec;
+ int tries = 10;
+ do {
+ boost::asio::io_context service;
+ boost::asio::cancellation_signal signal;
+ std::optional<error_code> result;
+
+ librados::ObjectReadOperation op;
+ op.assert_exists();
+ librados::async_operate(service, io, "noexist", &op, 0, nullptr,
+ capture(signal, result));
+
+ service.poll();
+ EXPECT_FALSE(service.stopped());
+ EXPECT_FALSE(result);
+
+ signal.emit(boost::asio::cancellation_type::total);
+
+ service.run();
+ ASSERT_TRUE(result);
+ ec = *result;
+
+ signal.emit(boost::asio::cancellation_type::all); // noop
+ } while (ec == std::errc::no_such_file_or_directory && --tries);
+
+ EXPECT_EQ(ec, boost::asio::error::operation_aborted);
+}
+
+TEST_F(AsioRados, AsyncWriteOperationCancelTerminal)
+{
+ // cancellation tests are racy, so retry if completion beats the cancellation
+ boost::system::error_code ec;
+ int tries = 10;
+ do {
+ boost::asio::io_context service;
+ boost::asio::cancellation_signal signal;
+ std::optional<error_code> result;
+
+ librados::ObjectWriteOperation op;
+ op.assert_exists();
+ librados::async_operate(service, io, "noexist", &op, 0, nullptr,
+ capture(signal, result));
+
+ service.poll();
+ EXPECT_FALSE(service.stopped());
+ EXPECT_FALSE(result);
+
+ signal.emit(boost::asio::cancellation_type::terminal);
+
+ service.run();
+ ASSERT_TRUE(result);
+ ec = *result;
+
+ signal.emit(boost::asio::cancellation_type::all); // noop
+ } while (ec == std::errc::no_such_file_or_directory && --tries);
+
+ EXPECT_EQ(ec, boost::asio::error::operation_aborted);
+}
+
+TEST_F(AsioRados, AsyncWriteOperationCancelTotal)
+{
+ boost::asio::io_context service;
+ boost::asio::cancellation_signal signal;
+ std::optional<error_code> ec;
+
+ librados::ObjectWriteOperation op;
+ op.assert_exists();
+ librados::async_operate(service, io, "noexist", &op, 0, nullptr,
+ capture(signal, ec));
+
+ service.poll();
+ EXPECT_FALSE(service.stopped());
+ EXPECT_FALSE(ec);
+
+  // no-op: write operations only honor terminal cancellation
+ signal.emit(boost::asio::cancellation_type::total);
+
+ service.run();
+ ASSERT_TRUE(ec);
+ EXPECT_EQ(ec, std::errc::no_such_file_or_directory);
+
+ signal.emit(boost::asio::cancellation_type::all); // noop
+}
+
+#endif // not _WIN32
+
int main(int argc, char **argv)
{
auto args = argv_to_vec(argc, argv);
diff --git a/src/test/objectstore/ObjectStoreImitator.h b/src/test/objectstore/ObjectStoreImitator.h
index d71d7f2fe58..875f9041b83 100644
--- a/src/test/objectstore/ObjectStoreImitator.h
+++ b/src/test/objectstore/ObjectStoreImitator.h
@@ -347,6 +347,16 @@ public:
) override {
return {};
}
+
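+  // stub for the omap_iterate() interface; the imitator does not model omap,
+  // so there is never anything to iterate over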
+ int omap_iterate(CollectionHandle &c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] object
+ /// [in] where the iterator should point to at the beginning
+ omap_iter_seek_t start_from,
+ std::function<omap_iter_ret_t(std::string_view, std::string_view)> f
+ ) override {
+ return 0;
+ }
+
void set_fsid(uuid_d u) override {}
uuid_d get_fsid() override { return {}; }
uint64_t estimate_objects_overhead(uint64_t num_objects) override {
diff --git a/src/test/objectstore/allocsim/ops_replayer.cc b/src/test/objectstore/allocsim/ops_replayer.cc
index fd947f5c454..c5908d9f576 100644
--- a/src/test/objectstore/allocsim/ops_replayer.cc
+++ b/src/test/objectstore/allocsim/ops_replayer.cc
@@ -1,4 +1,5 @@
#include <algorithm>
+#include <functional>
#include <boost/program_options/value_semantic.hpp>
#include <cassert>
#include <cctype>
@@ -13,26 +14,46 @@
#include <fstream>
#include <filesystem>
#include <mutex>
-#include "include/rados/buffer_fwd.h"
-#include "include/rados/librados.hpp"
#include <atomic>
-#include <fmt/format.h>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <iostream>
#include <vector>
+#include <format>
+
+#include <fmt/format.h>
#include <boost/program_options/variables_map.hpp>
#include <boost/program_options/parsers.hpp>
+#include "include/rados/buffer_fwd.h"
+#include "include/rados/librados.hpp"
+
namespace po = boost::program_options;
using namespace std;
using namespace ceph;
+namespace settings {
+
+// Returns a validator that throws if a value falls outside [min, max].
+// (Unlike std::clamp(), which silently clamps instead of throwing.)
+auto clamp_or_throw(auto min, auto max)
+{
+ return [=](auto& x) {
+ if(std::less<>{}(x, min) or std::greater<>{}(x, max)) {
+ throw std::out_of_range(fmt::format("value expected between {} and {}, but got {}", min, max, x));
+ }
+
+ return x;
+ };
+}
+
+} // namespace settings
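+// intended for use as a program_options notifier, e.g.:
+//   po::value<int>(&n)->notifier(settings::clamp_or_throw(1, 256))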
+
// compare shared_ptr<string>
struct StringPtrCompare
{
@@ -338,8 +359,8 @@ int main(int argc, char** argv) {
// options
uint64_t io_depth = 8;
- uint64_t nparser_threads = 16;
- uint64_t nworker_threads = 16;
+ int nparser_threads = 16;
+ int nworker_threads = 16;
string file("input.txt");
string ceph_conf_path("./ceph.conf");
string pool("test_pool");
@@ -351,8 +372,8 @@ int main(int argc, char** argv) {
("input-files,i", po::value<vector<string>>()->multitoken(), "List of input files (output of op_scraper.py). Multiple files will be merged and sorted by time order")
("ceph-conf", po::value<string>(&ceph_conf_path)->default_value("ceph.conf"), "Path to ceph conf")
("io-depth", po::value<uint64_t>(&io_depth)->default_value(64), "I/O depth")
- ("parser-threads", po::value<uint64_t>(&nparser_threads)->default_value(16), "Number of parser threads")
- ("worker-threads", po::value<uint64_t>(&nworker_threads)->default_value(16), "Number of I/O worker threads")
+ ("parser-threads", po::value<int>(&nparser_threads)->default_value(16)->notifier(settings::clamp_or_throw(1, 256)), "Number of parser threads")
+ ("worker-threads", po::value<int>(&nworker_threads)->default_value(16)->notifier(settings::clamp_or_throw(1, 256)), "Number of I/O worker threads")
("pool", po::value<string>(&pool)->default_value("test_pool"), "Pool to use for I/O")
("skip-do-ops", po::bool_switch(&skip_do_ops)->default_value(false), "Skip doing operations")
;
diff --git a/src/test/osd/CMakeLists.txt b/src/test/osd/CMakeLists.txt
index f2d1471e22e..798558ebbe0 100644
--- a/src/test/osd/CMakeLists.txt
+++ b/src/test/osd/CMakeLists.txt
@@ -22,7 +22,7 @@ install(TARGETS
add_executable(ceph_test_rados_io_sequence
${CMAKE_CURRENT_SOURCE_DIR}/ceph_test_rados_io_sequence.cc)
target_link_libraries(ceph_test_rados_io_sequence
- librados global object_io_exerciser)
+ librados global object_io_exerciser json_structures)
install(TARGETS
ceph_test_rados_io_sequence
DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/src/test/osd/ceph_test_rados_io_sequence.cc b/src/test/osd/ceph_test_rados_io_sequence.cc
index 4a768a016e2..96808ea37e5 100644
--- a/src/test/osd/ceph_test_rados_io_sequence.cc
+++ b/src/test/osd/ceph_test_rados_io_sequence.cc
@@ -1,83 +1,104 @@
#include "ceph_test_rados_io_sequence.h"
+#include <boost/asio/io_context.hpp>
#include <iostream>
#include <vector>
-#include <boost/asio/io_context.hpp>
-
-#include "include/random.h"
-
-#include "librados/librados_asio.h"
-#include "common/ceph_argparse.h"
-#include "include/interval_set.h"
-#include "global/global_init.h"
-#include "global/global_context.h"
+#include "common/Formatter.h"
#include "common/Thread.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_json.h"
#include "common/debug.h"
#include "common/dout.h"
#include "common/split.h"
#include "common/strtol.h" // for strict_iecstrtoll()
+#include "common/ceph_json.h"
+#include "common/Formatter.h"
#include "common/io_exerciser/DataGenerator.h"
+#include "common/io_exerciser/EcIoSequence.h"
+#include "common/io_exerciser/IoOp.h"
+#include "common/io_exerciser/IoSequence.h"
#include "common/io_exerciser/Model.h"
#include "common/io_exerciser/ObjectModel.h"
#include "common/io_exerciser/RadosIo.h"
-#include "common/io_exerciser/IoOp.h"
-#include "common/io_exerciser/IoSequence.h"
+#include "common/json/BalancerStructures.h"
+#include "common/json/ConfigStructures.h"
+#include "common/json/OSDStructures.h"
+#include "fmt/format.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "include/interval_set.h"
+#include "include/random.h"
+#include "json_spirit/json_spirit.h"
+#include "librados/librados_asio.h"
#define dout_subsys ceph_subsys_rados
#define dout_context g_ceph_context
+using OpType = ceph::io_exerciser::OpType;
+
+using DoneOp = ceph::io_exerciser::DoneOp;
+using BarrierOp = ceph::io_exerciser::BarrierOp;
+using CreateOp = ceph::io_exerciser::CreateOp;
+using RemoveOp = ceph::io_exerciser::RemoveOp;
+using SingleReadOp = ceph::io_exerciser::SingleReadOp;
+using DoubleReadOp = ceph::io_exerciser::DoubleReadOp;
+using TripleReadOp = ceph::io_exerciser::TripleReadOp;
+using SingleWriteOp = ceph::io_exerciser::SingleWriteOp;
+using DoubleWriteOp = ceph::io_exerciser::DoubleWriteOp;
+using TripleWriteOp = ceph::io_exerciser::TripleWriteOp;
+using SingleFailedWriteOp = ceph::io_exerciser::SingleFailedWriteOp;
+using DoubleFailedWriteOp = ceph::io_exerciser::DoubleFailedWriteOp;
+using TripleFailedWriteOp = ceph::io_exerciser::TripleFailedWriteOp;
+
namespace {
- struct Size {};
- void validate(boost::any& v, const std::vector<std::string>& values,
- Size *target_type, int) {
- po::validators::check_first_occurrence(v);
- const std::string &s = po::validators::get_single_string(values);
-
- std::string parse_error;
- uint64_t size = strict_iecstrtoll(s, &parse_error);
- if (!parse_error.empty()) {
- throw po::validation_error(po::validation_error::invalid_option_value);
- }
- v = boost::any(size);
- }
-
- struct Pair {};
- void validate(boost::any& v, const std::vector<std::string>& values,
- Pair *target_type, int) {
- po::validators::check_first_occurrence(v);
- const std::string &s = po::validators::get_single_string(values);
- auto part = ceph::split(s).begin();
- std::string parse_error;
- int first = strict_iecstrtoll(*part++, &parse_error);
- int second = strict_iecstrtoll(*part, &parse_error);
- if (!parse_error.empty()) {
- throw po::validation_error(po::validation_error::invalid_option_value);
- }
- v = boost::any(std::pair<int,int>{first,second});
- }
-
- struct PluginString {};
- void validate(boost::any& v, const std::vector<std::string>& values,
- PluginString *target_type, int) {
- po::validators::check_first_occurrence(v);
- const std::string &s = po::validators::get_single_string(values);
-
- const std::string_view* pluginIt = std::find(
- ceph::io_sequence::tester::pluginChoices.begin(),
- ceph::io_sequence::tester::pluginChoices.end(),
- s
- );
- if(ceph::io_sequence::tester::pluginChoices.end() == pluginIt)
- {
- throw po::validation_error(po::validation_error::invalid_option_value);
- }
+struct Size {};
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Size* target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string& s = po::validators::get_single_string(values);
- v = boost::any(*pluginIt);
+ std::string parse_error;
+ uint64_t size = strict_iecstrtoll(s, &parse_error);
+ if (!parse_error.empty()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
}
+ v = boost::any(size);
+}
+
+struct Pair {};
+void validate(boost::any& v, const std::vector<std::string>& values,
+ Pair* target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string& s = po::validators::get_single_string(values);
+ auto part = ceph::split(s).begin();
+ std::string parse_error;
+ int first = strict_iecstrtoll(*part++, &parse_error);
+ int second = strict_iecstrtoll(*part, &parse_error);
+ if (!parse_error.empty()) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+ v = boost::any(std::pair<int, int>{first, second});
+}
+
+struct PluginString {};
+void validate(boost::any& v, const std::vector<std::string>& values,
+ PluginString* target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string& s = po::validators::get_single_string(values);
+
+ const std::string_view* pluginIt =
+ std::find(ceph::io_sequence::tester::pluginChoices.begin(),
+ ceph::io_sequence::tester::pluginChoices.end(), s);
+ if (ceph::io_sequence::tester::pluginChoices.end() == pluginIt) {
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+
+ v = boost::any(*pluginIt);
+}
- constexpr std::string_view usage[] = {
+constexpr std::string_view usage[] = {
"Basic usage:",
"",
"ceph_test_rados_io_sequence",
@@ -119,103 +140,99 @@ namespace {
"\t are specified with unit of blocksize. Supported commands:",
"\t\t create <len>",
"\t\t remove",
- "\t\t read|write <off> <len>",
- "\t\t read2|write2 <off> <len> <off> <len>",
- "\t\t read3|write3 <off> <len> <off> <len> <off> <len>",
- "\t\t done"
- };
-
- po::options_description get_options_description()
- {
- po::options_description desc("ceph_test_rados_io options");
- desc.add_options()
- ("help,h",
- "show help message")
- ("listsequence,l",
- "show list of sequences")
- ("dryrun,d",
- "test sequence, do not issue any I/O")
- ("verbose",
- "more verbose output during test")
- ("sequence,s", po::value<int>(),
- "test specified sequence")
- ("seed", po::value<int>(),
- "seed for whole test")
- ("seqseed", po::value<int>(),
- "seed for sequence")
- ("blocksize,b", po::value<Size>(),
- "block size (default 2048)")
- ("chunksize,c", po::value<Size>(),
- "chunk size (default 4096)")
- ("pool,p", po::value<std::string>(),
- "pool name")
- ("object,o", po::value<std::string>()->default_value("test"),
- "object name")
- ("km", po::value<Pair>(),
- "k,m EC pool profile (default 2,2)")
- ("plugin", po::value<PluginString>(),
- "EC plugin (isa or jerasure)")
- ("objectsize", po::value<Pair>(),
- "min,max object size in blocks (default 1,32)")
- ("threads,t", po::value<int>(),
- "number of threads of I/O per object (default 1)")
- ("parallel,p", po::value<int>()->default_value(1),
- "number of objects to exercise in parallel")
- ("interactive",
- "interactive mode, execute IO commands from stdin");
-
- return desc;
- }
-
- int parse_io_seq_options(
- po::variables_map& vm,
- int argc,
- char** argv)
- {
- std::vector<std::string> unrecognized_options;
- try {
- po::options_description desc = get_options_description();
-
- auto parsed = po::command_line_parser(argc, argv)
- .options(desc)
- .allow_unregistered()
- .run();
- po::store(parsed, vm);
- po::notify(vm);
- unrecognized_options = po::collect_unrecognized(parsed.options,
- po::include_positional);
-
- if (!unrecognized_options.empty())
- {
- std::stringstream ss;
- ss << "Unrecognised command options supplied: ";
- while (unrecognized_options.size() > 1)
- {
- ss << unrecognized_options.back().c_str() << ", ";
- unrecognized_options.pop_back();
- }
- ss << unrecognized_options.back();
- dout(0) << ss.str() << dendl;
- return 1;
+ "\t\t read|write|failedwrite <off> <len>",
+ "\t\t read2|write2|failedwrite2 <off> <len> <off> <len>",
+ "\t\t read3|write3|failedwrite3 <off> <len> <off> <len> <off> <len>",
+ "\t\t injecterror <type> <shard> <good_count> <fail_count>",
+ "\t\t clearinject <type> <shard>",
+ "\t\t done"};
+
+po::options_description get_options_description() {
+ po::options_description desc("ceph_test_rados_io options");
+ desc.add_options()("help,h", "show help message")("listsequence,l",
+ "show list of sequences")(
+ "dryrun,d", "test sequence, do not issue any I/O")(
+ "verbose", "more verbose output during test")(
+ "sequence,s", po::value<int>(), "test specified sequence")(
+ "seed", po::value<int>(), "seed for whole test")(
+ "seqseed", po::value<int>(), "seed for sequence")(
+ "blocksize,b", po::value<Size>(), "block size (default 2048)")(
+ "chunksize,c", po::value<Size>(), "chunk size (default 4096)")(
+ "pool,p", po::value<std::string>(), "pool name")(
+ "object,o", po::value<std::string>()->default_value("test"),
+ "object name")("km", po::value<Pair>(),
+ "k,m EC pool profile (default 2,2)")(
+ "plugin", po::value<PluginString>(), "EC plugin (isa or jerasure)")(
+ "objectsize", po::value<Pair>(),
+ "min,max object size in blocks (default 1,32)")(
+ "threads,t", po::value<int>(),
+ "number of threads of I/O per object (default 1)")(
+ "parallel,p", po::value<int>()->default_value(1),
+ "number of objects to exercise in parallel")(
+ "testrecovery",
+ "Inject errors during sequences to test recovery processes of OSDs")(
+ "interactive", "interactive mode, execute IO commands from stdin")(
+ "allow_pool_autoscaling",
+ "Allows pool autoscaling. Disabled by default.")(
+ "allow_pool_balancer", "Enables pool balancing. Disabled by default.")(
+ "allow_pool_deep_scrubbing",
+ "Enables pool deep scrub. Disabled by default.")(
+ "allow_pool_scrubbing", "Enables pool scrubbing. Disabled by default.");
+
+ return desc;
+}
+
+int parse_io_seq_options(po::variables_map& vm, int argc, char** argv) {
+ std::vector<std::string> unrecognized_options;
+ try {
+ po::options_description desc = get_options_description();
+
+ auto parsed = po::command_line_parser(argc, argv)
+ .options(desc)
+ .allow_unregistered()
+ .run();
+ po::store(parsed, vm);
+ po::notify(vm);
+ unrecognized_options =
+ po::collect_unrecognized(parsed.options, po::include_positional);
+
+ if (!unrecognized_options.empty()) {
+ std::stringstream ss;
+ ss << "Unrecognised command options supplied: ";
+ while (unrecognized_options.size() > 1) {
+ ss << unrecognized_options.back().c_str() << ", ";
+ unrecognized_options.pop_back();
}
- } catch(const po::error& e) {
- std::cerr << "error: " << e.what() << std::endl;
+ ss << unrecognized_options.back();
+ dout(0) << ss.str() << dendl;
return 1;
}
-
- return 0;
+ } catch (const po::error& e) {
+ std::cerr << "error: " << e.what() << std::endl;
+ return 1;
}
+
+ return 0;
}
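+// serializes the request structure 's' to JSON (labelled with 'name') via
+// encode_json() and issues the result as a mon command, with the reply
+// returned through *outbl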
+template <typename S>
+int send_mon_command(S& s, librados::Rados& rados, const char* name,
+ ceph::buffer::list& inbl, ceph::buffer::list* outbl, Formatter* f) {
+ std::ostringstream oss;
+ encode_json(name, s, f);
+ f->flush(oss);
+ int rc = rados.mon_command(oss.str(), inbl, outbl, nullptr);
+ return rc;
+}
+
+} // namespace
+
template <typename T, int N, const std::array<T, N>& Ts>
-ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>
- ::ProgramOptionSelector(ceph::util::random_number_generator<int>& rng,
- po::variables_map vm,
- const std::string& option_name,
- bool set_forced,
- bool select_first)
- : rng(rng),
- option_name(option_name) {
+ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::
+ ProgramOptionSelector(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm, const std::string& option_name,
+ bool set_forced, bool select_first)
+ : rng(rng), option_name(option_name) {
if (set_forced && vm.count(option_name)) {
force_value = vm[option_name].as<T>();
}
@@ -226,76 +243,54 @@ ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>
}
template <typename T, int N, const std::array<T, N>& Ts>
-bool ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::isForced()
-{
+bool ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::isForced() {
return force_value.has_value();
}
template <typename T, int N, const std::array<T, N>& Ts>
-const T ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::choose()
-{
+const T ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::choose() {
if (force_value.has_value()) {
return *force_value;
} else if (first_value.has_value()) {
return *std::exchange(first_value, std::nullopt);
} else {
- return choices[rng(N-1)];
+ return choices[rng(N - 1)];
}
}
-
-
ceph::io_sequence::tester::SelectObjectSize::SelectObjectSize(
- ceph::util::random_number_generator<int>& rng,
- po::variables_map vm)
- : ProgramOptionSelector(rng, vm, "objectsize", true, true)
-{
-}
-
-
+ ceph::util::random_number_generator<int>& rng, po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "objectsize", true, true) {}
ceph::io_sequence::tester::SelectBlockSize::SelectBlockSize(
- ceph::util::random_number_generator<int>& rng,
- po::variables_map vm)
- : ProgramOptionSelector(rng, vm, "blocksize", true, true)
-{
-}
-
-
+ ceph::util::random_number_generator<int>& rng, po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "blocksize", true, true) {}
ceph::io_sequence::tester::SelectNumThreads::SelectNumThreads(
- ceph::util::random_number_generator<int>& rng,
- po::variables_map vm)
- : ProgramOptionSelector(rng, vm, "threads", true, true)
-{
-}
-
-
+ ceph::util::random_number_generator<int>& rng, po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "threads", true, true) {}
ceph::io_sequence::tester::SelectSeqRange::SelectSeqRange(
- ceph::util::random_number_generator<int>& rng,
- po::variables_map vm)
- : ProgramOptionSelector(rng, vm, "sequence", false, false)
-{
+ ceph::util::random_number_generator<int>& rng, po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "sequence", false, false) {
if (vm.count(option_name)) {
ceph::io_exerciser::Sequence s =
- static_cast<ceph::io_exerciser::Sequence>(vm["sequence"].as<int>());
+ static_cast<ceph::io_exerciser::Sequence>(vm["sequence"].as<int>());
if (s < ceph::io_exerciser::Sequence::SEQUENCE_BEGIN ||
s >= ceph::io_exerciser::Sequence::SEQUENCE_END) {
dout(0) << "Sequence argument out of range" << dendl;
throw po::validation_error(po::validation_error::invalid_option_value);
}
ceph::io_exerciser::Sequence e = s;
- force_value = std::make_optional<std::pair<ceph::io_exerciser::Sequence,
- ceph::io_exerciser::Sequence>>(
- std::make_pair(s, ++e));
+ force_value = std::make_optional<
+ std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>>(
+ std::make_pair(s, ++e));
}
}
-const std::pair<ceph::io_exerciser::Sequence,ceph::io_exerciser::Sequence>
- ceph::io_sequence::tester::SelectSeqRange::choose() {
- if (force_value.has_value())
- {
+const std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>
+ceph::io_sequence::tester::SelectSeqRange::choose() {
+ if (force_value.has_value()) {
return *force_value;
} else {
return std::make_pair(ceph::io_exerciser::Sequence::SEQUENCE_BEGIN,
@@ -303,45 +298,34 @@ const std::pair<ceph::io_exerciser::Sequence,ceph::io_exerciser::Sequence>
}
}
-
-
ceph::io_sequence::tester::SelectErasureKM::SelectErasureKM(
- ceph::util::random_number_generator<int>& rng,
- po::variables_map vm)
- : ProgramOptionSelector(rng, vm, "km", true, true)
-{
-}
-
-
+ ceph::util::random_number_generator<int>& rng, po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "km", true, true) {}
ceph::io_sequence::tester::SelectErasurePlugin::SelectErasurePlugin(
- ceph::util::random_number_generator<int>& rng,
- po::variables_map vm)
- : ProgramOptionSelector(rng, vm, "plugin", true, false)
-{
-}
-
-
-
-ceph::io_sequence::tester::SelectErasureChunkSize::SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, po::variables_map vm)
- : ProgramOptionSelector(rng, vm, "stripe_unit", true, false)
-{
-}
-
+ ceph::util::random_number_generator<int>& rng, po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "plugin", true, false) {}
+ceph::io_sequence::tester::SelectErasureChunkSize::SelectErasureChunkSize(
+ ceph::util::random_number_generator<int>& rng, po::variables_map vm)
+ : ProgramOptionSelector(rng, vm, "chunksize", true, true) {}
ceph::io_sequence::tester::SelectECPool::SelectECPool(
- ceph::util::random_number_generator<int>& rng,
- po::variables_map vm,
- librados::Rados& rados,
- bool dry_run)
- : ProgramOptionSelector(rng, vm, "pool", false, false),
- rados(rados),
- dry_run(dry_run),
- skm(SelectErasureKM(rng, vm)),
- spl(SelectErasurePlugin(rng, vm)),
- scs(SelectErasureChunkSize(rng, vm))
-{
+ ceph::util::random_number_generator<int>& rng, po::variables_map vm,
+ librados::Rados& rados, bool dry_run, bool allow_pool_autoscaling,
+ bool allow_pool_balancer, bool allow_pool_deep_scrubbing,
+ bool allow_pool_scrubbing, bool test_recovery)
+ : ProgramOptionSelector(rng, vm, "pool", false, false),
+ rados(rados),
+ dry_run(dry_run),
+ allow_pool_autoscaling(allow_pool_autoscaling),
+ allow_pool_balancer(allow_pool_balancer),
+ allow_pool_deep_scrubbing(allow_pool_deep_scrubbing),
+ allow_pool_scrubbing(allow_pool_scrubbing),
+ test_recovery(test_recovery),
+ skm(SelectErasureKM(rng, vm)),
+ spl(SelectErasurePlugin(rng, vm)),
+ scs(SelectErasureChunkSize(rng, vm)) {
if (!skm.isForced()) {
if (vm.count("pool")) {
force_value = vm["pool"].as<std::string>();
@@ -349,147 +333,239 @@ ceph::io_sequence::tester::SelectECPool::SelectECPool(
}
}
-const std::string ceph::io_sequence::tester::SelectECPool::choose()
-{
- std::pair<int,int> value;
+const std::string ceph::io_sequence::tester::SelectECPool::choose() {
+ std::pair<int, int> value;
if (!skm.isForced() && force_value.has_value()) {
+ int rc;
+ bufferlist inbl, outbl;
+ auto formatter = std::make_unique<JSONFormatter>(false);
+
+ ceph::messaging::osd::OSDPoolGetRequest osdPoolGetRequest{*force_value};
+ rc = send_mon_command(osdPoolGetRequest, rados, "OSDPoolGetRequest", inbl,
+ &outbl, formatter.get());
+ ceph_assert(rc == 0);
+
+ JSONParser p;
+ bool success = p.parse(outbl.c_str(), outbl.length());
+ ceph_assert(success);
+
+ ceph::messaging::osd::OSDPoolGetReply osdPoolGetReply;
+ osdPoolGetReply.decode_json(&p);
+
+ ceph::messaging::osd::OSDECProfileGetRequest osdECProfileGetRequest{
+ osdPoolGetReply.erasure_code_profile};
+ rc = send_mon_command(osdECProfileGetRequest, rados,
+ "OSDECProfileGetRequest", inbl, &outbl,
+ formatter.get());
+ ceph_assert(rc == 0);
+
+ success = p.parse(outbl.c_str(), outbl.length());
+ ceph_assert(success);
+
+ ceph::messaging::osd::OSDECProfileGetReply reply;
+ reply.decode_json(&p);
+ k = reply.k;
+ m = reply.m;
return *force_value;
} else {
value = skm.choose();
}
- int k = value.first;
- int m = value.second;
+ k = value.first;
+ m = value.second;
const std::string plugin = std::string(spl.choose());
const uint64_t chunk_size = scs.choose();
- std::string pool_name = "ec_" + plugin +
- "_cs" + std::to_string(chunk_size) +
- "_k" + std::to_string(k) +
- "_m" + std::to_string(m);
- if (!dry_run)
- {
+ std::string pool_name = "ec_" + plugin + "_cs" + std::to_string(chunk_size) +
+ "_k" + std::to_string(k) + "_m" + std::to_string(m);
+ if (!dry_run) {
create_pool(rados, pool_name, plugin, chunk_size, k, m);
}
return pool_name;
}
void ceph::io_sequence::tester::SelectECPool::create_pool(
- librados::Rados& rados,
- const std::string& pool_name,
- const std::string& plugin,
- uint64_t chunk_size,
- int k, int m)
-{
+ librados::Rados& rados, const std::string& pool_name,
+ const std::string& plugin, uint64_t chunk_size, int k, int m) {
int rc;
bufferlist inbl, outbl;
- std::string profile_create =
- "{\"prefix\": \"osd erasure-code-profile set\", \
- \"name\": \"testprofile-" + pool_name + "\", \
- \"profile\": [ \"plugin=" + plugin + "\", \
- \"k=" + std::to_string(k) + "\", \
- \"m=" + std::to_string(m) + "\", \
- \"stripe_unit=" + std::to_string(chunk_size) + "\", \
- \"crush-failure-domain=osd\"]}";
- rc = rados.mon_command(profile_create, inbl, &outbl, nullptr);
+ auto formatter = std::make_unique<JSONFormatter>(false);
+
+ ceph::messaging::osd::OSDECProfileSetRequest ecProfileSetRequest{
+ fmt::format("testprofile-{}", pool_name),
+ {fmt::format("plugin={}", plugin), fmt::format("k={}", k),
+ fmt::format("m={}", m), fmt::format("stripe_unit={}", chunk_size),
+ fmt::format("crush-failure-domain=osd")}};
+ rc = send_mon_command(ecProfileSetRequest, rados, "OSDECProfileSetRequest",
+ inbl, &outbl, formatter.get());
ceph_assert(rc == 0);
- std::string cmdstr =
- "{\"prefix\": \"osd pool create\", \
- \"pool\": \"" + pool_name + "\", \
- \"pool_type\": \"erasure\", \
- \"pg_num\": 8, \
- \"pgp_num\": 8, \
- \"erasure_code_profile\": \"testprofile-" + pool_name + "\"}";
- rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr);
+
+ ceph::messaging::osd::OSDECPoolCreateRequest poolCreateRequest{
+ pool_name, "erasure", 8, 8, fmt::format("testprofile-{}", pool_name)};
+ rc = send_mon_command(poolCreateRequest, rados, "OSDECPoolCreateRequest",
+ inbl, &outbl, formatter.get());
ceph_assert(rc == 0);
-}
+ if (allow_pool_autoscaling) {
+ ceph::messaging::osd::OSDSetRequest setNoAutoscaleRequest{"noautoscale",
+ std::nullopt};
+ rc = send_mon_command(setNoAutoscaleRequest, rados, "OSDSetRequest", inbl,
+ &outbl, formatter.get());
+ ceph_assert(rc == 0);
+ }
+
+ if (allow_pool_balancer) {
+ ceph::messaging::balancer::BalancerOffRequest balancerOffRequest{};
+ rc = send_mon_command(balancerOffRequest, rados, "BalancerOffRequest", inbl,
+ &outbl, formatter.get());
+ ceph_assert(rc == 0);
+
+ ceph::messaging::balancer::BalancerStatusRequest balancerStatusRequest{};
+ rc = send_mon_command(balancerStatusRequest, rados, "BalancerStatusRequest",
+ inbl, &outbl, formatter.get());
+ ceph_assert(rc == 0);
+
+ JSONParser p;
+ bool success = p.parse(outbl.c_str(), outbl.length());
+ ceph_assert(success);
+
+ ceph::messaging::balancer::BalancerStatusReply reply;
+ reply.decode_json(&p);
+ ceph_assert(!reply.active);
+ }
+ if (allow_pool_deep_scrubbing) {
+ ceph::messaging::osd::OSDSetRequest setNoDeepScrubRequest{"nodeep-scrub",
+ std::nullopt};
+ rc = send_mon_command(setNoDeepScrubRequest, rados, "setNoDeepScrubRequest",
+ inbl, &outbl, formatter.get());
+ ceph_assert(rc == 0);
+ }
+
+ if (allow_pool_scrubbing) {
+ ceph::messaging::osd::OSDSetRequest setNoScrubRequest{"noscrub",
+ std::nullopt};
+ rc = send_mon_command(setNoScrubRequest, rados, "OSDSetRequest", inbl,
+ &outbl, formatter.get());
+ ceph_assert(rc == 0);
+ }
+
+ if (test_recovery) {
+ ceph::messaging::config::ConfigSetRequest configSetBluestoreDebugRequest{
+ "global", "bluestore_debug_inject_read_err", "true", std::nullopt};
+ rc = send_mon_command(configSetBluestoreDebugRequest, rados,
+ "ConfigSetRequest", inbl, &outbl,
+ formatter.get());
+ ceph_assert(rc == 0);
+
+ ceph::messaging::config::ConfigSetRequest configSetMaxMarkdownRequest{
+ "global", "osd_max_markdown_count", "99999999", std::nullopt};
+ rc =
+ send_mon_command(configSetMaxMarkdownRequest, rados, "ConfigSetRequest",
+ inbl, &outbl, formatter.get());
+ ceph_assert(rc == 0);
+ }
+}
-ceph::io_sequence::tester::TestObject::TestObject( const std::string oid,
- librados::Rados& rados,
- boost::asio::io_context& asio,
- SelectBlockSize& sbs,
- SelectECPool& spo,
- SelectObjectSize& sos,
- SelectNumThreads& snt,
- SelectSeqRange& ssr,
- ceph::util::random_number_generator<int>& rng,
- ceph::mutex& lock,
- ceph::condition_variable& cond,
- bool dryrun,
- bool verbose,
- std::optional<int> seqseed) :
- rng(rng), verbose(verbose), seqseed(seqseed)
-{
+ceph::io_sequence::tester::TestObject::TestObject(
+ const std::string oid, librados::Rados& rados,
+ boost::asio::io_context& asio, SelectBlockSize& sbs, SelectECPool& spo,
+ SelectObjectSize& sos, SelectNumThreads& snt, SelectSeqRange& ssr,
+ ceph::util::random_number_generator<int>& rng, ceph::mutex& lock,
+ ceph::condition_variable& cond, bool dryrun, bool verbose,
+ std::optional<int> seqseed, bool testrecovery)
+ : rng(rng), verbose(verbose), seqseed(seqseed), testrecovery(testrecovery) {
if (dryrun) {
- verbose = true;
- exerciser_model = std::make_unique<ceph::io_exerciser::ObjectModel>(oid,
- sbs.choose(),
- rng());
+ exerciser_model = std::make_unique<ceph::io_exerciser::ObjectModel>(
+ oid, sbs.choose(), rng());
} else {
const std::string pool = spo.choose();
+ poolK = spo.getChosenK();
+ poolM = spo.getChosenM();
+
int threads = snt.choose();
- exerciser_model = std::make_unique<ceph::io_exerciser::RadosIo>(rados,
- asio,
- pool,
- oid,
- sbs.choose(),
- rng(),
- threads,
- lock,
- cond);
- dout(0) << "= " << oid << " pool=" << pool
- << " threads=" << threads
- << " blocksize=" << exerciser_model->get_block_size()
- << " =" << dendl;
+
+ bufferlist inbl, outbl;
+ auto formatter = std::make_unique<JSONFormatter>(false);
+
+ std::optional<std::vector<int>> cached_shard_order = std::nullopt;
+
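+    // only query and cache the acting set up front when no pool maintenance
+    // features are enabled, since those could change the mapping mid-test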
+ if (!spo.get_allow_pool_autoscaling() && !spo.get_allow_pool_balancer() &&
+ !spo.get_allow_pool_deep_scrubbing() &&
+ !spo.get_allow_pool_scrubbing()) {
+ ceph::messaging::osd::OSDMapRequest osdMapRequest{pool, oid, ""};
+ int rc = send_mon_command(osdMapRequest, rados, "OSDMapRequest", inbl,
+ &outbl, formatter.get());
+ ceph_assert(rc == 0);
+
+ JSONParser p;
+ bool success = p.parse(outbl.c_str(), outbl.length());
+ ceph_assert(success);
+
+ ceph::messaging::osd::OSDMapReply reply{};
+ reply.decode_json(&p);
+ cached_shard_order = reply.acting;
+ }
+
+ exerciser_model = std::make_unique<ceph::io_exerciser::RadosIo>(
+ rados, asio, pool, oid, cached_shard_order, sbs.choose(), rng(),
+ threads, lock, cond);
+ dout(0) << "= " << oid << " pool=" << pool << " threads=" << threads
+ << " blocksize=" << exerciser_model->get_block_size() << " ="
+ << dendl;
}
obj_size_range = sos.choose();
seq_range = ssr.choose();
curseq = seq_range.first;
- seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq,
- obj_size_range,
- seqseed.value_or(rng()));
+
+ if (testrecovery) {
+ seq = ceph::io_exerciser::EcIoSequence::generate_sequence(
+ curseq, obj_size_range, poolK, poolM, seqseed.value_or(rng()));
+ } else {
+ seq = ceph::io_exerciser::IoSequence::generate_sequence(
+ curseq, obj_size_range, seqseed.value_or(rng()));
+ }
+
op = seq->next();
done = false;
- dout(0) << "== " << exerciser_model->get_oid() << " "
- << curseq << " "
- << seq->get_name()
- << " ==" <<dendl;
+ dout(0) << "== " << exerciser_model->get_oid() << " " << curseq << " "
+ << seq->get_name_with_seqseed() << " ==" << dendl;
}
-bool ceph::io_sequence::tester::TestObject::readyForIo()
-{
+bool ceph::io_sequence::tester::TestObject::readyForIo() {
return exerciser_model->readyForIoOp(*op);
}
-bool ceph::io_sequence::tester::TestObject::next()
-{
+bool ceph::io_sequence::tester::TestObject::next() {
if (!done) {
if (verbose) {
- dout(0) << exerciser_model->get_oid()
- << " Step " << seq->get_step() << ": "
- << op->to_string(exerciser_model->get_block_size()) << dendl;
+ dout(0) << exerciser_model->get_oid() << " Step " << seq->get_step()
+ << ": " << op->to_string(exerciser_model->get_block_size())
+ << dendl;
} else {
- dout(5) << exerciser_model->get_oid()
- << " Step " << seq->get_step() << ": "
- << op->to_string(exerciser_model->get_block_size()) << dendl;
+ dout(5) << exerciser_model->get_oid() << " Step " << seq->get_step()
+ << ": " << op->to_string(exerciser_model->get_block_size())
+ << dendl;
}
exerciser_model->applyIoOp(*op);
- if (op->done()) {
- ++curseq;
- if (curseq == seq_range.second) {
+ if (op->getOpType() == ceph::io_exerciser::OpType::Done) {
+ curseq = seq->getNextSupportedSequenceId();
+ if (curseq >= seq_range.second) {
done = true;
dout(0) << exerciser_model->get_oid()
<< " Number of IOs = " << exerciser_model->get_num_io()
<< dendl;
} else {
- seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq,
- obj_size_range,
- seqseed.value_or(rng()));
- dout(0) << "== " << exerciser_model->get_oid() << " "
- << curseq << " " << seq->get_name()
- << " ==" <<dendl;
+ if (testrecovery) {
+ seq = ceph::io_exerciser::EcIoSequence::generate_sequence(
+ curseq, obj_size_range, poolK, poolM, seqseed.value_or(rng()));
+ } else {
+ seq = ceph::io_exerciser::IoSequence::generate_sequence(
+ curseq, obj_size_range, seqseed.value_or(rng()));
+ }
+
+ dout(0) << "== " << exerciser_model->get_oid() << " " << curseq << " "
+ << seq->get_name_with_seqseed() << " ==" << dendl;
op = seq->next();
}
} else {
@@ -499,27 +575,30 @@ bool ceph::io_sequence::tester::TestObject::next()
return done;
}
-bool ceph::io_sequence::tester::TestObject::finished()
-{
- return done;
-}
+bool ceph::io_sequence::tester::TestObject::finished() { return done; }
-int ceph::io_sequence::tester::TestObject::get_num_io()
-{
+int ceph::io_sequence::tester::TestObject::get_num_io() {
return exerciser_model->get_num_io();
}
ceph::io_sequence::tester::TestRunner::TestRunner(po::variables_map& vm,
- librados::Rados& rados) :
- rados(rados),
- seed(vm.contains("seed") ? vm["seed"].as<int>() : time(nullptr)),
- rng(ceph::util::random_number_generator<int>(seed)),
- sbs{rng, vm},
- sos{rng, vm},
- spo{rng, vm, rados, vm.contains("dryrun")},
- snt{rng, vm},
- ssr{rng, vm}
-{
+ librados::Rados& rados)
+ : rados(rados),
+ seed(vm.contains("seed") ? vm["seed"].as<int>() : time(nullptr)),
+ rng(ceph::util::random_number_generator<int>(seed)),
+ sbs{rng, vm},
+ sos{rng, vm},
+ spo{rng,
+ vm,
+ rados,
+ vm.contains("dryrun"),
+ vm.contains("allow_pool_autoscaling"),
+ vm.contains("allow_pool_balancer"),
+ vm.contains("allow_pool_deep_scrubbing"),
+ vm.contains("allow_pool_scrubbing"),
+            vm.contains("testrecovery")},
+ snt{rng, vm},
+ ssr{rng, vm} {
dout(0) << "Test using seed " << seed << dendl;
verbose = vm.contains("verbose");
@@ -532,19 +611,23 @@ ceph::io_sequence::tester::TestRunner::TestRunner(po::variables_map& vm,
num_objects = vm["parallel"].as<int>();
object_name = vm["object"].as<std::string>();
interactive = vm.contains("interactive");
+ testrecovery = vm.contains("testrecovery");
+
+ allow_pool_autoscaling = vm.contains("allow_pool_autoscaling");
+ allow_pool_balancer = vm.contains("allow_pool_balancer");
+ allow_pool_deep_scrubbing = vm.contains("allow_pool_deep_scrubbing");
+ allow_pool_scrubbing = vm.contains("allow_pool_scrubbing");
- if (!dryrun)
- {
+ if (!dryrun) {
guard.emplace(boost::asio::make_work_guard(asio));
- thread = make_named_thread("io_thread",[&asio = asio] { asio.run(); });
+ thread = make_named_thread("io_thread", [&asio = asio] { asio.run(); });
}
show_help = vm.contains("help");
show_sequence = vm.contains("listsequence");
}
-ceph::io_sequence::tester::TestRunner::~TestRunner()
-{
+ceph::io_sequence::tester::TestRunner::~TestRunner() {
if (!dryrun) {
guard = std::nullopt;
asio.stop();
@@ -553,34 +636,38 @@ ceph::io_sequence::tester::TestRunner::~TestRunner()
}
}
-void ceph::io_sequence::tester::TestRunner::help()
-{
+void ceph::io_sequence::tester::TestRunner::help() {
std::cout << get_options_description() << std::endl;
for (auto line : usage) {
std::cout << line << std::endl;
}
}
-void ceph::io_sequence::tester::TestRunner::list_sequence()
-{
+void ceph::io_sequence::tester::TestRunner::list_sequence(bool testrecovery) {
  // List sequences
- std::pair<int,int> obj_size_range = sos.choose();
- for (ceph::io_exerciser::Sequence s
- = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN;
- s < ceph::io_exerciser::Sequence::SEQUENCE_END; ++s) {
- std::unique_ptr<ceph::io_exerciser::IoSequence> seq =
- ceph::io_exerciser::IoSequence::generate_sequence(s,
- obj_size_range,
- seqseed.value_or(rng()));
- dout(0) << s << " " << seq->get_name() << dendl;
+ std::pair<int, int> obj_size_range = sos.choose();
+ ceph::io_exerciser::Sequence s = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN;
+ std::unique_ptr<ceph::io_exerciser::IoSequence> seq;
+ if (testrecovery) {
+ seq = ceph::io_exerciser::EcIoSequence::generate_sequence(
+ s, obj_size_range, spo.getChosenK(), spo.getChosenM(),
+ seqseed.value_or(rng()));
+ } else {
+ seq = ceph::io_exerciser::IoSequence::generate_sequence(
+ s, obj_size_range, seqseed.value_or(rng()));
}
+
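+  // walk the chain via getNextSupportedSequenceId() so that sequences the
+  // chosen mode does not support are skipped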
+ do {
+ dout(0) << s << " " << seq->get_name_with_seqseed() << dendl;
+ s = seq->getNextSupportedSequenceId();
+ } while (s != ceph::io_exerciser::Sequence::SEQUENCE_END);
}
-std::string ceph::io_sequence::tester::TestRunner::get_token()
-{
- static std::string line;
- static ceph::split split = ceph::split("");
- static ceph::spliterator tokens;
+void ceph::io_sequence::tester::TestRunner::clear_tokens() {
+ tokens = split.end();
+}
+
+std::string ceph::io_sequence::tester::TestRunner::get_token() {
while (line.empty() || tokens == split.end()) {
if (!std::getline(std::cin, line)) {
throw std::runtime_error("End of input");
@@ -591,127 +678,211 @@ std::string ceph::io_sequence::tester::TestRunner::get_token()
return std::string(*tokens++);
}
-uint64_t ceph::io_sequence::tester::TestRunner::get_numeric_token()
-{
+std::optional<std::string>
+ceph::io_sequence::tester::TestRunner::get_optional_token() {
+ std::optional<std::string> ret = std::nullopt;
+ if (tokens != split.end()) {
+ ret = std::string(*tokens++);
+ }
+ return ret;
+}
+
+uint64_t ceph::io_sequence::tester::TestRunner::get_numeric_token() {
std::string parse_error;
std::string token = get_token();
uint64_t num = strict_iecstrtoll(token, &parse_error);
if (!parse_error.empty()) {
- throw std::runtime_error("Invalid number "+token);
+ throw std::runtime_error("Invalid number " + token);
}
return num;
}
-bool ceph::io_sequence::tester::TestRunner::run_test()
-{
- if (show_help)
- {
+std::optional<uint64_t>
+ceph::io_sequence::tester::TestRunner::get_optional_numeric_token() {
+ std::string parse_error;
+ std::optional<std::string> token = get_optional_token();
+ if (token) {
+ uint64_t num = strict_iecstrtoll(*token, &parse_error);
+ if (!parse_error.empty()) {
+ throw std::runtime_error("Invalid number " + *token);
+ }
+ return num;
+ }
+
+  return std::nullopt;
+}
+
+bool ceph::io_sequence::tester::TestRunner::run_test() {
+ if (show_help) {
help();
return true;
- }
- else if (show_sequence)
- {
- list_sequence();
+ } else if (show_sequence) {
+ list_sequence(testrecovery);
return true;
- }
- else if (interactive)
- {
+ } else if (interactive) {
return run_interactive_test();
- }
- else
- {
+ } else {
return run_automated_test();
}
}
-bool ceph::io_sequence::tester::TestRunner::run_interactive_test()
-{
+bool ceph::io_sequence::tester::TestRunner::run_interactive_test() {
bool done = false;
std::unique_ptr<ceph::io_exerciser::IoOp> ioop;
std::unique_ptr<ceph::io_exerciser::Model> model;
if (dryrun) {
- model = std::make_unique<ceph::io_exerciser::ObjectModel>(object_name,
- sbs.choose(),
- rng());
+ model = std::make_unique<ceph::io_exerciser::ObjectModel>(
+ object_name, sbs.choose(), rng());
} else {
const std::string pool = spo.choose();
- model = std::make_unique<ceph::io_exerciser::RadosIo>(rados, asio, pool,
- object_name, sbs.choose(),
- rng(), 1, // 1 thread
- lock, cond);
+
+ bufferlist inbl, outbl;
+ auto formatter = std::make_unique<JSONFormatter>(false);
+
+ ceph::messaging::osd::OSDMapRequest osdMapRequest{pool, object_name, ""};
+ int rc = send_mon_command(osdMapRequest, rados, "OSDMapRequest", inbl,
+ &outbl, formatter.get());
+ ceph_assert(rc == 0);
+
+ JSONParser p;
+ bool success = p.parse(outbl.c_str(), outbl.length());
+ ceph_assert(success);
+
+ ceph::messaging::osd::OSDMapReply reply{};
+ reply.decode_json(&p);
+
+ model = std::make_unique<ceph::io_exerciser::RadosIo>(
+ rados, asio, pool, object_name, reply.acting, sbs.choose(), rng(),
+ 1, // 1 thread
+ lock, cond);
}
while (!done) {
const std::string op = get_token();
- if (!op.compare("done") || !op.compare("q") || !op.compare("quit")) {
- ioop = ceph::io_exerciser::IoOp::generate_done();
- } else if (!op.compare("create")) {
- ioop = ceph::io_exerciser::IoOp::generate_create(get_numeric_token());
- } else if (!op.compare("remove") || !op.compare("delete")) {
- ioop = ceph::io_exerciser::IoOp::generate_remove();
- } else if (!op.compare("read")) {
+ if (op == "done" || op == "q" || op == "quit") {
+ ioop = ceph::io_exerciser::DoneOp::generate();
+ } else if (op == "create") {
+ ioop = ceph::io_exerciser::CreateOp::generate(get_numeric_token());
+ } else if (op == "remove" || op == "delete") {
+ ioop = ceph::io_exerciser::RemoveOp::generate();
+ } else if (op == "read") {
uint64_t offset = get_numeric_token();
uint64_t length = get_numeric_token();
- ioop = ceph::io_exerciser::IoOp::generate_read(offset, length);
- } else if (!op.compare("read2")) {
+ ioop = ceph::io_exerciser::SingleReadOp::generate(offset, length);
+ } else if (op == "read2") {
uint64_t offset1 = get_numeric_token();
uint64_t length1 = get_numeric_token();
uint64_t offset2 = get_numeric_token();
uint64_t length2 = get_numeric_token();
- ioop = ceph::io_exerciser::IoOp::generate_read2(offset1, length1,
- offset2, length2);
- } else if (!op.compare("read3")) {
+ ioop = DoubleReadOp::generate(offset1, length1, offset2, length2);
+ } else if (op == "read3") {
uint64_t offset1 = get_numeric_token();
uint64_t length1 = get_numeric_token();
uint64_t offset2 = get_numeric_token();
uint64_t length2 = get_numeric_token();
uint64_t offset3 = get_numeric_token();
uint64_t length3 = get_numeric_token();
- ioop = ceph::io_exerciser::IoOp::generate_read3(offset1, length1,
- offset2, length2,
- offset3, length3);
- } else if (!op.compare("write")) {
+ ioop = TripleReadOp::generate(offset1, length1, offset2, length2, offset3,
+ length3);
+ } else if (op == "write") {
uint64_t offset = get_numeric_token();
uint64_t length = get_numeric_token();
- ioop = ceph::io_exerciser::IoOp::generate_write(offset, length);
- } else if (!op.compare("write2")) {
+ ioop = SingleWriteOp::generate(offset, length);
+ } else if (op == "write2") {
uint64_t offset1 = get_numeric_token();
uint64_t length1 = get_numeric_token();
uint64_t offset2 = get_numeric_token();
uint64_t length2 = get_numeric_token();
- ioop = ceph::io_exerciser::IoOp::generate_write2(offset1, length1,
- offset2, length2);
- } else if (!op.compare("write3")) {
+ ioop = DoubleWriteOp::generate(offset1, length1, offset2, length2);
+ } else if (op == "write3") {
uint64_t offset1 = get_numeric_token();
uint64_t length1 = get_numeric_token();
uint64_t offset2 = get_numeric_token();
uint64_t length2 = get_numeric_token();
uint64_t offset3 = get_numeric_token();
uint64_t length3 = get_numeric_token();
- ioop = ceph::io_exerciser::IoOp::generate_write3(offset1, length1,
- offset2, length2,
- offset3, length3);
+ ioop = TripleWriteOp::generate(offset1, length1, offset2, length2,
+ offset3, length3);
+ } else if (op == "failedwrite") {
+ uint64_t offset = get_numeric_token();
+ uint64_t length = get_numeric_token();
+ ioop = SingleFailedWriteOp::generate(offset, length);
+ } else if (op == "failedwrite2") {
+ uint64_t offset1 = get_numeric_token();
+ uint64_t length1 = get_numeric_token();
+ uint64_t offset2 = get_numeric_token();
+ uint64_t length2 = get_numeric_token();
+ ioop = DoubleFailedWriteOp::generate(offset1, length1, offset2, length2);
+ } else if (op == "failedwrite3") {
+ uint64_t offset1 = get_numeric_token();
+ uint64_t length1 = get_numeric_token();
+ uint64_t offset2 = get_numeric_token();
+ uint64_t length2 = get_numeric_token();
+ uint64_t offset3 = get_numeric_token();
+ uint64_t length3 = get_numeric_token();
+ ioop = TripleFailedWriteOp::generate(offset1, length1, offset2, length2,
+ offset3, length3);
+ } else if (op == "injecterror") {
+ std::string inject_type = get_token();
+ int shard = get_numeric_token();
+ std::optional<int> type = get_optional_numeric_token();
+ std::optional<int> when = get_optional_numeric_token();
+ std::optional<int> duration = get_optional_numeric_token();
+ if (inject_type == "read") {
+ ioop = ceph::io_exerciser::InjectReadErrorOp::generate(shard, type,
+ when, duration);
+ } else if (inject_type == "write") {
+ ioop = ceph::io_exerciser::InjectWriteErrorOp::generate(shard, type,
+ when, duration);
+ } else {
+ clear_tokens();
+ ioop.reset();
+ dout(0) << fmt::format("Invalid error inject {}. No action performed.",
+ inject_type)
+ << dendl;
+ }
+ } else if (op == "clearinject") {
+ std::string inject_type = get_token();
+ int shard = get_numeric_token();
+ std::optional<int> type = get_optional_numeric_token();
+ if (inject_type == "read") {
+ ioop =
+ ceph::io_exerciser::ClearReadErrorInjectOp::generate(shard, type);
+ } else if (inject_type == "write") {
+ ioop =
+ ceph::io_exerciser::ClearWriteErrorInjectOp::generate(shard, type);
+ } else {
+ clear_tokens();
+ ioop.reset();
+ dout(0) << fmt::format("Invalid error inject {}. No action performed.",
+ inject_type)
+ << dendl;
+ }
} else {
- throw std::runtime_error("Invalid operation "+op);
+ clear_tokens();
+ ioop.reset();
+ dout(0) << fmt::format("Invalid op {}. No action performed.", op)
+ << dendl;
}
- dout(0) << ioop->to_string(model->get_block_size()) << dendl;
- model->applyIoOp(*ioop);
- done = ioop->done();
- if (!done) {
- ioop = ceph::io_exerciser::IoOp::generate_barrier();
+ if (ioop) {
+ dout(0) << ioop->to_string(model->get_block_size()) << dendl;
model->applyIoOp(*ioop);
+ done = ioop->getOpType() == ceph::io_exerciser::OpType::Done;
+ if (!done) {
+ ioop = ceph::io_exerciser::BarrierOp::generate();
+ model->applyIoOp(*ioop);
+ }
}
}
return true;
}
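
The interactive grammar above is easiest to see in action. A hypothetical session (token names taken from the parser; sizes, offsets, and lengths are in units of the model's block size) might look like:

  create 10
  write 0 4
  failedwrite 2 2
  injecterror read 1 0 5 10
  clearinject read 1
  quit

Each accepted op is echoed via dout, applied to the model, and followed by an implicit barrier; an unrecognised token now clears the pending tokens and logs a message instead of throwing, so a typo no longer aborts the session.
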
-bool ceph::io_sequence::tester::TestRunner::run_automated_test()
-{
+bool ceph::io_sequence::tester::TestRunner::run_automated_test() {
// Create a test for each object
- std::vector<std::shared_ptr<
- ceph::io_sequence::tester::TestObject>> test_objects;
+ std::vector<std::shared_ptr<ceph::io_sequence::tester::TestObject>>
+ test_objects;
for (int obj = 0; obj < num_objects; obj++) {
std::string name;
@@ -721,15 +892,9 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test()
name = object_name + std::to_string(obj);
}
test_objects.push_back(
- std::make_shared<ceph::io_sequence::tester::TestObject>(
- name,
- rados, asio,
- sbs, spo, sos, snt, ssr,
- rng, lock, cond,
- dryrun, verbose,
- seqseed
- )
- );
+ std::make_shared<ceph::io_sequence::tester::TestObject>(
+ name, rados, asio, sbs, spo, sos, snt, ssr, rng, lock, cond, dryrun,
+ verbose, seqseed, testrecovery));
}
if (!dryrun) {
rados.wait_for_latest_osdmap();
@@ -748,16 +913,15 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test()
for (auto obj = test_objects.begin(); obj != test_objects.end(); ++obj) {
std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj;
if (!to->finished()) {
- lock.lock();
- bool ready = to->readyForIo();
- lock.unlock();
- if (ready)
- {
- to->next();
- started_io = true;
- } else {
- need_wait = true;
- }
+ lock.lock();
+ bool ready = to->readyForIo();
+ lock.unlock();
+ if (ready) {
+ to->next();
+ started_io = true;
+ } else {
+ need_wait = true;
+ }
}
}
if (!started_io && need_wait) {
@@ -767,8 +931,7 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test()
std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj;
if (!to->finished()) {
need_wait = !to->readyForIo();
- if (!need_wait)
- {
+ if (!need_wait) {
break;
}
}
@@ -788,18 +951,16 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test()
return true;
}
-int main(int argc, char **argv)
-{
+int main(int argc, char** argv) {
auto args = argv_to_vec(argc, argv);
env_to_vec(args);
auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
- CODE_ENVIRONMENT_UTILITY, 0);
+ CODE_ENVIRONMENT_UTILITY, 0);
common_init_finish(cct.get());
po::variables_map vm;
int rc = parse_io_seq_options(vm, argc, argv);
- if (rc != 0)
- {
+ if (rc != 0) {
return rc;
}
@@ -814,7 +975,7 @@ int main(int argc, char **argv)
std::unique_ptr<ceph::io_sequence::tester::TestRunner> runner;
try {
runner = std::make_unique<ceph::io_sequence::tester::TestRunner>(vm, rados);
- } catch(const po::error& e) {
+ } catch (const po::error& e) {
return 1;
}
runner->run_test();
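
For readers unfamiliar with the messaging helpers used above, the OSDMapRequest/OSDMapReply pair wraps the "osd map" mon command to learn the acting set for an object before constructing RadosIo. A minimal sketch of that round-trip, assuming an already-connected librados handle (the JSON decode of the reply is elided here):

  #include "include/rados/librados.hpp"
  #include <iostream>
  #include <string>

  // Ask the monitors which OSDs currently serve <pool>/<oid>. The patch
  // decodes the "acting" array out of this JSON and hands it to RadosIo.
  int print_osd_map(librados::Rados& rados,
                    const std::string& pool, const std::string& oid) {
    librados::bufferlist inbl, outbl;
    std::string outs;
    std::string cmd = R"({"prefix": "osd map", "pool": ")" + pool +
                      R"(", "object": ")" + oid + R"(", "format": "json"})";
    int rc = rados.mon_command(cmd, inbl, &outbl, &outs);
    if (rc == 0) {
      std::cout << std::string(outbl.c_str(), outbl.length()) << "\n";
    }
    return rc;
  }
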
diff --git a/src/test/osd/ceph_test_rados_io_sequence.h b/src/test/osd/ceph_test_rados_io_sequence.h
index 4e21d025700..9af5f706b2f 100644
--- a/src/test/osd/ceph_test_rados_io_sequence.h
+++ b/src/test/osd/ceph_test_rados_io_sequence.h
@@ -1,34 +1,36 @@
+#include <boost/program_options.hpp>
+#include <optional>
#include <utility>
-#include "include/random.h"
-
-#include "global/global_init.h"
-#include "global/global_context.h"
-
#include "common/io_exerciser/IoOp.h"
#include "common/io_exerciser/IoSequence.h"
#include "common/io_exerciser/Model.h"
-
+#include "common/split.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "include/random.h"
#include "librados/librados_asio.h"
#include <boost/asio/io_context.hpp>
-#include <boost/program_options.hpp>
+
/* Overview
*
* class ProgramOptionSelector
- * Base class for selector objects below with common code for
+ * Base class for selector objects below with common code for
* selecting options
- *
+ *
* class SelectObjectSize
* Selects min and max object sizes for a test
*
* class SelectErasureKM
 * Selects EC k and m values for a test
- *
+ *
* class SelectErasurePlugin
 * Selects a plugin for a test
- *
+ *
* class SelectECPool
 * Selects an EC pool (plugin, k and m) for a test, and also
 * creates the pool.
@@ -58,287 +60,279 @@
namespace po = boost::program_options;
-namespace ceph
-{
- namespace io_sequence::tester
- {
- // Choices for min and max object size
- inline constexpr size_t objectSizeSize = 10;
- inline constexpr std::array<std::pair<int,int>,objectSizeSize>
- objectSizeChoices = {{
- {1,32}, // Default - best for boundary checking
- {12,14},
- {28,30},
- {36,38},
- {42,44},
- {52,54},
- {66,68},
- {72,74},
- {83,83},
- {97,97}
- }};
-
- // Choices for block size
- inline constexpr int blockSizeSize = 5;
- inline constexpr std::array<uint64_t, blockSizeSize> blockSizeChoices = {{
- 2048, // Default - test boundaries for EC 4K chunk size
- 512,
- 3767,
- 4096,
- 32768
- }};
-
- // Choices for number of threads
- inline constexpr int threadArraySize = 4;
- inline constexpr std::array<int, threadArraySize> threadCountChoices = {{
- 1, // Default
- 2,
- 4,
- 8
- }};
-
- // Choices for EC k+m profile
- inline constexpr int kmSize = 6;
- inline constexpr std::array<std::pair<int,int>, kmSize> kmChoices = {{
- {2,2}, // Default - reasonable coverage
- {2,1},
- {2,3},
- {3,2},
- {4,2},
- {5,1}
- }};
-
- // Choices for EC chunk size
- inline constexpr int chunkSizeSize = 3;
- inline constexpr std::array<uint64_t, chunkSizeSize> chunkSizeChoices = {{
- 4*1024,
- 64*1024,
- 256*1024
- }};
-
- // Choices for plugin
- inline constexpr int pluginListSize = 2;
- inline constexpr std::array<std::string_view,
- pluginListSize> pluginChoices = {{
- "jerasure",
- "isa"
- }};
-
- inline constexpr std::array<std::pair<ceph::io_exerciser::Sequence,
- ceph::io_exerciser::Sequence>,
- 0> sequencePairs = {{}};
-
- inline constexpr std::array<std::string, 0> poolChoices = {{}};
-
- template <typename T, int N, const std::array<T, N>& Ts>
- class ProgramOptionSelector
- {
- public:
- ProgramOptionSelector(ceph::util::random_number_generator<int>& rng,
- po::variables_map vm,
- const std::string& option_name,
- bool set_forced,
- bool select_first
- );
- virtual ~ProgramOptionSelector() = default;
- bool isForced();
- virtual const T choose();
-
- protected:
- ceph::util::random_number_generator<int>& rng;
- static constexpr std::array<T, N> choices = Ts;
-
- std::optional<T> force_value;
- std::optional<T> first_value;
-
- std::string option_name;
- };
-
- class SelectObjectSize
- : public ProgramOptionSelector<std::pair<int, int>,
- io_sequence::tester::objectSizeSize,
- io_sequence::tester::objectSizeChoices>
- {
- public:
- SelectObjectSize(ceph::util::random_number_generator<int>& rng,
- po::variables_map vm);
- };
-
- class SelectBlockSize
- : public ProgramOptionSelector<uint64_t,
- io_sequence::tester::blockSizeSize,
- io_sequence::tester::blockSizeChoices>
- {
- public:
- SelectBlockSize(ceph::util::random_number_generator<int>& rng,
- po::variables_map vm);
- };
-
- class SelectNumThreads
- : public ProgramOptionSelector<int,
- io_sequence::tester::threadArraySize,
- io_sequence::tester::threadCountChoices>
- {
- public:
- SelectNumThreads(ceph::util::random_number_generator<int>& rng,
- po::variables_map vm);
- };
-
- class SelectSeqRange
- : public ProgramOptionSelector<std::pair<ceph::io_exerciser::Sequence,
- ceph::io_exerciser::Sequence>,
- 0, io_sequence::tester::sequencePairs>
- {
- public:
- SelectSeqRange(ceph::util::random_number_generator<int>& rng,
- po::variables_map vm);
-
- const std::pair<ceph::io_exerciser::Sequence,
- ceph::io_exerciser::Sequence> choose() override;
- };
-
- class SelectErasureKM
- : public ProgramOptionSelector<std::pair<int,int>,
- io_sequence::tester::kmSize,
- io_sequence::tester::kmChoices>
- {
- public:
- SelectErasureKM(ceph::util::random_number_generator<int>& rng,
+namespace ceph {
+namespace io_sequence::tester {
+// Choices for min and max object size
+inline constexpr size_t objectSizeSize = 10;
+inline constexpr std::array<std::pair<int, int>, objectSizeSize>
+ objectSizeChoices = {{{1, 32}, // Default - best for boundary checking
+ {12, 14},
+ {28, 30},
+ {36, 38},
+ {42, 44},
+ {52, 54},
+ {66, 68},
+ {72, 74},
+ {83, 83},
+ {97, 97}}};
+
+// Choices for block size
+inline constexpr int blockSizeSize = 5;
+inline constexpr std::array<uint64_t, blockSizeSize> blockSizeChoices = {
+ {2048, // Default - test boundaries for EC 4K chunk size
+ 512, 3767, 4096, 32768}};
+
+// Choices for number of threads
+inline constexpr int threadArraySize = 4;
+inline constexpr std::array<int, threadArraySize> threadCountChoices = {
+ {1, // Default
+ 2, 4, 8}};
+
+// Choices for EC k+m profile
+inline constexpr int kmSize = 6;
+inline constexpr std::array<std::pair<int, int>, kmSize> kmChoices = {
+ {{2, 2}, // Default - reasonable coverage
+ {2, 1},
+ {2, 3},
+ {3, 2},
+ {4, 2},
+ {5, 1}}};
+
+// Choices for EC chunk size
+inline constexpr int chunkSizeSize = 3;
+inline constexpr std::array<uint64_t, chunkSizeSize> chunkSizeChoices = {
+ {4 * 1024, 64 * 1024, 256 * 1024}};
+
+// Choices for plugin
+inline constexpr int pluginListSize = 2;
+inline constexpr std::array<std::string_view, pluginListSize> pluginChoices = {
+ {"jerasure", "isa"}};
+
+inline constexpr std::array<
+ std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>, 0>
+ sequencePairs = {{}};
+
+inline constexpr std::array<std::string, 0> poolChoices = {{}};
+
+template <typename T, int N, const std::array<T, N>& Ts>
+class ProgramOptionSelector {
+ public:
+ ProgramOptionSelector(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm, const std::string& option_name,
+ bool set_forced, bool select_first);
+ virtual ~ProgramOptionSelector() = default;
+ bool isForced();
+ virtual const T choose();
+
+ protected:
+ ceph::util::random_number_generator<int>& rng;
+ static constexpr std::array<T, N> choices = Ts;
+
+ std::optional<T> force_value;
+ std::optional<T> first_value;
+
+ std::string option_name;
+};
+
+class SelectObjectSize
+ : public ProgramOptionSelector<std::pair<int, int>,
+ io_sequence::tester::objectSizeSize,
+ io_sequence::tester::objectSizeChoices> {
+ public:
+ SelectObjectSize(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+};
+
+class SelectBlockSize
+ : public ProgramOptionSelector<uint64_t, io_sequence::tester::blockSizeSize,
+ io_sequence::tester::blockSizeChoices> {
+ public:
+ SelectBlockSize(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+};
+
+class SelectNumThreads
+ : public ProgramOptionSelector<int, io_sequence::tester::threadArraySize,
+ io_sequence::tester::threadCountChoices> {
+ public:
+ SelectNumThreads(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+};
+
+class SelectSeqRange
+ : public ProgramOptionSelector<
+ std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>,
+ 0, io_sequence::tester::sequencePairs> {
+ public:
+ SelectSeqRange(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+
+ const std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>
+ choose() override;
+};
+
+class SelectErasureKM
+ : public ProgramOptionSelector<std::pair<int, int>,
+ io_sequence::tester::kmSize,
+ io_sequence::tester::kmChoices> {
+ public:
+ SelectErasureKM(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+};
+
+class SelectErasurePlugin
+ : public ProgramOptionSelector<std::string_view,
+ io_sequence::tester::pluginListSize,
+ io_sequence::tester::pluginChoices> {
+ public:
+ SelectErasurePlugin(ceph::util::random_number_generator<int>& rng,
po::variables_map vm);
- };
-
- class SelectErasurePlugin
- : public ProgramOptionSelector<std::string_view,
- io_sequence::tester::pluginListSize,
- io_sequence::tester::pluginChoices>
- {
- public:
- SelectErasurePlugin(ceph::util::random_number_generator<int>& rng,
- po::variables_map vm);
- };
-
- class SelectErasureChunkSize
- : public ProgramOptionSelector<uint64_t,
- io_sequence::tester::chunkSizeSize,
- io_sequence::tester::chunkSizeChoices>
- {
- public:
- SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, po::variables_map vm);
- };
-
- class SelectECPool
- : public ProgramOptionSelector<std::string,
- 0,
- io_sequence::tester::poolChoices>
- {
- public:
- SelectECPool(ceph::util::random_number_generator<int>& rng,
- po::variables_map vm,
- librados::Rados& rados,
- bool dry_run);
- const std::string choose() override;
-
- private:
- void create_pool(librados::Rados& rados,
- const std::string& pool_name,
- const std::string& plugin,
- uint64_t chunk_size,
- int k, int m);
-
- protected:
- librados::Rados& rados;
- bool dry_run;
-
- SelectErasureKM skm;
- SelectErasurePlugin spl;
- SelectErasureChunkSize scs;
- };
-
- class TestObject
- {
- public:
- TestObject( const std::string oid,
- librados::Rados& rados,
- boost::asio::io_context& asio,
- ceph::io_sequence::tester::SelectBlockSize& sbs,
- ceph::io_sequence::tester::SelectECPool& spl,
- ceph::io_sequence::tester::SelectObjectSize& sos,
- ceph::io_sequence::tester::SelectNumThreads& snt,
- ceph::io_sequence::tester::SelectSeqRange& ssr,
- ceph::util::random_number_generator<int>& rng,
- ceph::mutex& lock,
- ceph::condition_variable& cond,
- bool dryrun,
- bool verbose,
- std::optional<int> seqseed);
-
- int get_num_io();
- bool readyForIo();
- bool next();
- bool finished();
-
- protected:
- std::unique_ptr<ceph::io_exerciser::Model> exerciser_model;
- std::pair<int,int> obj_size_range;
- std::pair<ceph::io_exerciser::Sequence,
- ceph::io_exerciser::Sequence> seq_range;
- ceph::io_exerciser::Sequence curseq;
- std::unique_ptr<ceph::io_exerciser::IoSequence> seq;
- std::unique_ptr<ceph::io_exerciser::IoOp> op;
- bool done;
- ceph::util::random_number_generator<int>& rng;
- bool verbose;
- std::optional<int> seqseed;
- };
-
- class TestRunner
- {
- public:
- TestRunner(po::variables_map& vm, librados::Rados& rados);
- ~TestRunner();
-
- bool run_test();
-
- private:
- librados::Rados& rados;
- int seed;
- ceph::util::random_number_generator<int> rng;
-
- ceph::io_sequence::tester::SelectBlockSize sbs;
- ceph::io_sequence::tester::SelectObjectSize sos;
- ceph::io_sequence::tester::SelectECPool spo;
- ceph::io_sequence::tester::SelectNumThreads snt;
- ceph::io_sequence::tester::SelectSeqRange ssr;
-
- boost::asio::io_context asio;
- std::thread thread;
- std::optional<boost::asio::executor_work_guard<
- boost::asio::io_context::executor_type>> guard;
- ceph::mutex lock = ceph::make_mutex("RadosIo::lock");
- ceph::condition_variable cond;
-
- bool input_valid;
-
- bool verbose;
- bool dryrun;
- std::optional<int> seqseed;
- bool interactive;
-
- bool show_sequence;
- bool show_help;
-
- int num_objects;
- std::string object_name;
-
- std::string get_token();
- uint64_t get_numeric_token();
-
- bool run_automated_test();
-
- bool run_interactive_test();
-
- void help();
- void list_sequence();
- };
- }
-}
+};
+
+class SelectErasureChunkSize
+ : public ProgramOptionSelector<uint64_t, io_sequence::tester::chunkSizeSize,
+ io_sequence::tester::chunkSizeChoices> {
+ public:
+ SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm);
+};
+
+class SelectECPool
+ : public ProgramOptionSelector<std::string, 0,
+ io_sequence::tester::poolChoices> {
+ public:
+ SelectECPool(ceph::util::random_number_generator<int>& rng,
+ po::variables_map vm, librados::Rados& rados, bool dry_run,
+ bool allow_pool_autoscaling, bool allow_pool_balancer,
+ bool allow_pool_deep_scrubbing, bool allow_pool_scrubbing,
+ bool test_recovery);
+ const std::string choose() override;
+
+ bool get_allow_pool_autoscaling() { return allow_pool_autoscaling; }
+ bool get_allow_pool_balancer() { return allow_pool_balancer; }
+ bool get_allow_pool_deep_scrubbing() { return allow_pool_deep_scrubbing; }
+ bool get_allow_pool_scrubbing() { return allow_pool_scrubbing; }
+ int getChosenK() const { return k; }
+ int getChosenM() const { return m; }
+
+ private:
+ void create_pool(librados::Rados& rados, const std::string& pool_name,
+ const std::string& plugin, uint64_t chunk_size, int k,
+ int m);
+
+ protected:
+ librados::Rados& rados;
+ bool dry_run;
+ bool allow_pool_autoscaling;
+ bool allow_pool_balancer;
+ bool allow_pool_deep_scrubbing;
+ bool allow_pool_scrubbing;
+ bool test_recovery;
+ int k;
+ int m;
+
+ SelectErasureKM skm;
+ SelectErasurePlugin spl;
+ SelectErasureChunkSize scs;
+};
+
+class TestObject {
+ public:
+ TestObject(const std::string oid, librados::Rados& rados,
+ boost::asio::io_context& asio,
+ ceph::io_sequence::tester::SelectBlockSize& sbs,
+ ceph::io_sequence::tester::SelectECPool& spl,
+ ceph::io_sequence::tester::SelectObjectSize& sos,
+ ceph::io_sequence::tester::SelectNumThreads& snt,
+ ceph::io_sequence::tester::SelectSeqRange& ssr,
+ ceph::util::random_number_generator<int>& rng, ceph::mutex& lock,
+ ceph::condition_variable& cond, bool dryrun, bool verbose,
+ std::optional<int> seqseed, bool testRecovery);
+
+ int get_num_io();
+ bool readyForIo();
+ bool next();
+ bool finished();
+
+ protected:
+ std::unique_ptr<ceph::io_exerciser::Model> exerciser_model;
+ std::pair<int, int> obj_size_range;
+ std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>
+ seq_range;
+ ceph::io_exerciser::Sequence curseq;
+ std::unique_ptr<ceph::io_exerciser::IoSequence> seq;
+ std::unique_ptr<ceph::io_exerciser::IoOp> op;
+ bool done;
+ ceph::util::random_number_generator<int>& rng;
+ bool verbose;
+ std::optional<int> seqseed;
+ int poolK;
+ int poolM;
+ bool testrecovery;
+};
+
+class TestRunner {
+ public:
+ TestRunner(po::variables_map& vm, librados::Rados& rados);
+ ~TestRunner();
+
+ bool run_test();
+
+ private:
+ librados::Rados& rados;
+ int seed;
+ ceph::util::random_number_generator<int> rng;
+
+ ceph::io_sequence::tester::SelectBlockSize sbs;
+ ceph::io_sequence::tester::SelectObjectSize sos;
+ ceph::io_sequence::tester::SelectECPool spo;
+ ceph::io_sequence::tester::SelectNumThreads snt;
+ ceph::io_sequence::tester::SelectSeqRange ssr;
+
+ boost::asio::io_context asio;
+ std::thread thread;
+ std::optional<
+ boost::asio::executor_work_guard<boost::asio::io_context::executor_type>>
+ guard;
+ ceph::mutex lock = ceph::make_mutex("RadosIo::lock");
+ ceph::condition_variable cond;
+
+ bool input_valid;
+
+ bool verbose;
+ bool dryrun;
+ std::optional<int> seqseed;
+ bool interactive;
+
+ bool testrecovery;
+
+ bool allow_pool_autoscaling;
+ bool allow_pool_balancer;
+ bool allow_pool_deep_scrubbing;
+ bool allow_pool_scrubbing;
+
+ bool show_sequence;
+ bool show_help;
+
+ int num_objects;
+ std::string object_name;
+
+ std::string line;
+ ceph::split split = ceph::split("");
+ ceph::spliterator tokens;
+
+ void clear_tokens();
+ std::string get_token();
+ std::optional<std::string> get_optional_token();
+ uint64_t get_numeric_token();
+ std::optional<uint64_t> get_optional_numeric_token();
+
+ bool run_automated_test();
+
+ bool run_interactive_test();
+
+ void help();
+ void list_sequence(bool testrecovery);
+};
+} // namespace io_sequence::tester
+} // namespace ceph
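
The ProgramOptionSelector hierarchy above follows one selection policy: a value forced on the command line always wins, an optional deterministic "first" value is consumed once, and otherwise a random entry from the choices array is drawn. A hedged stand-in showing that policy (pick_like is illustrative, not Ceph API; the real choose() lives in the .cc file):

  #include <array>
  #include <optional>
  #include <random>

  template <typename T, std::size_t N>
  T pick_like(const std::array<T, N>& choices,
              std::optional<T>& force_value,  // set when the CLI forced it
              std::optional<T>& first_value,  // set for select_first mode
              std::mt19937& rng) {
    if (force_value) return *force_value;     // always honour the option
    if (first_value) {                        // deterministic first pick,
      T v = *first_value;                     // consumed exactly once
      first_value.reset();
      return v;
    }
    std::uniform_int_distribution<std::size_t> d(0, N - 1);
    return choices[d(rng)];                   // otherwise random coverage
  }
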
diff --git a/src/test/pybind/pytest.ini b/src/test/pybind/pytest.ini
index dccf2a346dc..97569e88299 100644
--- a/src/test/pybind/pytest.ini
+++ b/src/test/pybind/pytest.ini
@@ -7,3 +7,4 @@ markers =
stats
tier
watch
+ wait
diff --git a/src/test/pybind/test_rados.py b/src/test/pybind/test_rados.py
index cb2a4f96101..25423bd8dcb 100644
--- a/src/test/pybind/test_rados.py
+++ b/src/test/pybind/test_rados.py
@@ -207,7 +207,7 @@ class TestRados(object):
def test_get_fsid(self):
fsid = self.rados.get_fsid()
- assert re.match('[0-9a-f\-]{36}', fsid, re.I)
+ assert re.match(r'[0-9a-f\-]{36}', fsid, re.I)
def test_blocklist_add(self):
self.rados.blocklist_add("1.2.3.4/123", 1)
diff --git a/src/test/rgw/rgw_multi/tests.py b/src/test/rgw/rgw_multi/tests.py
index d95feb5aa95..433cd034fe0 100644
--- a/src/test/rgw/rgw_multi/tests.py
+++ b/src/test/rgw/rgw_multi/tests.py
@@ -15,6 +15,7 @@ import boto
import boto.s3.connection
from boto.s3.website import WebsiteConfiguration
from boto.s3.cors import CORSConfiguration
+from botocore.exceptions import ClientError
from nose.tools import eq_ as eq
from nose.tools import assert_not_equal, assert_equal, assert_true, assert_false
@@ -3638,4 +3639,23 @@ def test_copy_object_different_bucket():
CopySource = source_bucket.name + '/' + objname)
zonegroup_bucket_checkpoint(zonegroup_conns, dest_bucket.name)
-
+
+def test_bucket_create_location_constraint():
+ for zonegroup in realm.current_period.zonegroups:
+ zonegroup_conns = ZonegroupConns(zonegroup)
+ for zg in realm.current_period.zonegroups:
+ z = zonegroup_conns.rw_zones[0]
+ bucket_name = gen_bucket_name()
+ if zg.name == zonegroup.name:
+ # my zonegroup should pass
+ z.s3_client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': zg.name})
+ # check bucket location
+ response = z.s3_client.get_bucket_location(Bucket=bucket_name)
+ assert_equal(response['LocationConstraint'], zg.name)
+ else:
+ # other zonegroup should fail with 400
+ e = assert_raises(ClientError,
+ z.s3_client.create_bucket,
+ Bucket=bucket_name,
+ CreateBucketConfiguration={'LocationConstraint': zg.name})
+ assert e.response['ResponseMetadata']['HTTPStatusCode'] == 400
diff --git a/src/test/rgw/test_rgw_iam_policy.cc b/src/test/rgw/test_rgw_iam_policy.cc
index 7dadb7812ff..1d13c2aa013 100644
--- a/src/test/rgw/test_rgw_iam_policy.cc
+++ b/src/test/rgw/test_rgw_iam_policy.cc
@@ -75,6 +75,8 @@ using rgw::IAM::s3GetObjectTagging;
using rgw::IAM::s3GetObjectVersion;
using rgw::IAM::s3GetObjectVersionTagging;
using rgw::IAM::s3GetObjectVersionTorrent;
+using rgw::IAM::s3GetObjectAttributes;
+using rgw::IAM::s3GetObjectVersionAttributes;
using rgw::IAM::s3GetPublicAccessBlock;
using rgw::IAM::s3GetReplicationConfiguration;
using rgw::IAM::s3ListAllMyBuckets;
@@ -419,6 +421,8 @@ TEST_F(PolicyTest, Parse3) {
act2[s3GetObjectVersionAcl] = 1;
act2[s3GetObjectTorrent] = 1;
act2[s3GetObjectVersionTorrent] = 1;
+ act2[s3GetObjectAttributes] = 1;
+ act2[s3GetObjectVersionAttributes] = 1;
act2[s3GetAccelerateConfiguration] = 1;
act2[s3GetBucketAcl] = 1;
act2[s3GetBucketOwnershipControls] = 1;
@@ -487,6 +491,8 @@ TEST_F(PolicyTest, Eval3) {
s3allow[s3GetObjectVersion] = 1;
s3allow[s3GetObjectAcl] = 1;
s3allow[s3GetObjectVersionAcl] = 1;
+ s3allow[s3GetObjectAttributes] = 1;
+ s3allow[s3GetObjectVersionAttributes] = 1;
s3allow[s3GetObjectTorrent] = 1;
s3allow[s3GetObjectVersionTorrent] = 1;
s3allow[s3GetAccelerateConfiguration] = 1;
@@ -883,6 +889,8 @@ TEST_F(ManagedPolicyTest, AmazonS3ReadOnlyAccess)
act[s3GetObjectVersionAcl] = 1;
act[s3GetObjectTorrent] = 1;
act[s3GetObjectVersionTorrent] = 1;
+ act[s3GetObjectAttributes] = 1;
+ act[s3GetObjectVersionAttributes] = 1;
act[s3GetAccelerateConfiguration] = 1;
act[s3GetBucketAcl] = 1;
act[s3GetBucketOwnershipControls] = 1;
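
The two new action bits map to the S3 GetObjectAttributes API (current and versioned forms). An illustrative policy statement granting just those actions, in the same JSON vocabulary the parser under test accepts (the bucket name is a placeholder):

  {
    "Version": "2012-10-17",
    "Statement": [{
      "Effect": "Allow",
      "Action": ["s3:GetObjectAttributes", "s3:GetObjectVersionAttributes"],
      "Resource": "arn:aws:s3:::example-bucket/*"
    }]
  }
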
diff --git a/src/test/test_ipaddr.cc b/src/test/test_ipaddr.cc
index 49038815318..21df1d4056b 100644
--- a/src/test/test_ipaddr.cc
+++ b/src/test/test_ipaddr.cc
@@ -995,3 +995,158 @@ TEST(pick_address, ipv4_ipv6_enabled2)
ASSERT_EQ(-1, r);
}
}
+
+// Test for IPv4 address
+TEST(is_addr_in_subnet, ipv4)
+{
+ std::string public_network = "10.1.1.0/24";
+ entity_addr_t addr;
+ addr.parse("10.1.1.2", nullptr);
+
+ boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false);
+ cct->_conf._clear_safe_to_start_threads();
+ cct->_conf.set_val("ms_bind_ipv4", "true");
+ cct->_conf.set_val("ms_bind_ipv6", "false");
+
+ bool r = is_addr_in_subnet(cct.get(), public_network, addr);
+ ASSERT_EQ(true, r);
+}
+
+// Test for IPv6 address
+TEST(is_addr_in_subnet, ipv6)
+{
+ std::string public_network = "2001:db8::/64";
+ entity_addr_t addr;
+ addr.parse("2001:db8::1", nullptr);
+
+ boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false);
+ cct->_conf._clear_safe_to_start_threads();
+ cct->_conf.set_val("ms_bind_ipv6", "true");
+ cct->_conf.set_val("ms_bind_ipv4", "false");
+
+ bool r = is_addr_in_subnet(cct.get(), public_network, addr);
+ ASSERT_EQ(true, r);
+}
+
+// Test for an address outside the subnet
+TEST(is_addr_in_subnet, invalid_address)
+{
+ std::string public_network = "10.1.1.0/24";
+ entity_addr_t addr;
+ addr.parse("192.168.1.1", nullptr);
+
+ boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false);
+ cct->_conf._clear_safe_to_start_threads();
+ cct->_conf.set_val("ms_bind_ipv4", "true");
+ cct->_conf.set_val("ms_bind_ipv6", "false");
+
+ bool r = is_addr_in_subnet(cct.get(), public_network, addr);
+ ASSERT_EQ(false, r);
+}
+
+// Test for malformed address
+TEST(is_addr_in_subnet, malformed_address)
+{
+ std::string public_network = "10.1.1.0/24";
+ entity_addr_t addr;
+ addr.parse("invalid_address", nullptr);
+
+ boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false);
+ cct->_conf._clear_safe_to_start_threads();
+ cct->_conf.set_val("ms_bind_ipv4", "true");
+ cct->_conf.set_val("ms_bind_ipv6", "false");
+
+ // Test with a malformed address
+ bool r = is_addr_in_subnet(cct.get(), public_network, addr);
+ ASSERT_EQ(false, r);
+}
+
+TEST(is_addr_in_subnet, boundary_ipv4)
+{
+ std::string public_network = "10.1.1.0/24";
+ entity_addr_t addr_low;
+ addr_low.parse("10.1.1.0", nullptr);
+ entity_addr_t addr_high;
+ addr_high.parse("10.1.1.255", nullptr);
+ entity_addr_t addr_out;
+ addr_out.parse("10.1.2.0", nullptr);
+
+ boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false);
+ cct->_conf._clear_safe_to_start_threads();
+ cct->_conf.set_val("ms_bind_ipv4", "true");
+ cct->_conf.set_val("ms_bind_ipv6", "false");
+
+ ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_low));
+ ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_high));
+ ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network, addr_out));
+}
+
+TEST(is_addr_in_subnet, boundary_ipv6)
+{
+ std::string public_network = "2001:db8::/64";
+ entity_addr_t addr_low;
+ addr_low.parse("2001:db8::", nullptr);
+ entity_addr_t addr_high;
+ addr_high.parse("2001:db8:0:0:ffff:ffff:ffff:ffff", nullptr);
+ entity_addr_t addr_out;
+ addr_out.parse("2001:db9::", nullptr);
+
+ boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false);
+ cct->_conf._clear_safe_to_start_threads();
+ cct->_conf.set_val("ms_bind_ipv6", "true");
+ cct->_conf.set_val("ms_bind_ipv4", "false");
+
+ ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_low));
+ ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_high));
+ ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network, addr_out));
+}
+
+TEST(is_addr_in_subnet, overlapping_subnets)
+{
+ std::string public_network_1 = "10.1.1.0/24";
+ std::string public_network_2 = "10.1.2.0/24";
+ entity_addr_t addr;
+ addr.parse("10.1.1.5", nullptr);
+
+ boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false);
+ cct->_conf._clear_safe_to_start_threads();
+ cct->_conf.set_val("ms_bind_ipv4", "true");
+ cct->_conf.set_val("ms_bind_ipv6", "false");
+
+ ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network_1, addr));
+ ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_2, addr));
+}
+
+TEST(is_addr_in_subnet, mismatched_family)
+{
+ std::string public_network_1 = "2001:db8::/64";
+ entity_addr_t addr_1;
+ addr_1.parse("10.1.1.5", nullptr);
+
+ std::string public_network_2 = "10.1.1.0/24";
+ entity_addr_t addr_2;
+ addr_2.parse("2001:db8::1", nullptr);
+
+ boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false);
+ cct->_conf._clear_safe_to_start_threads();
+ cct->_conf.set_val("ms_bind_ipv4", "true");
+ cct->_conf.set_val("ms_bind_ipv6", "true");
+
+ ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_1, addr_1));
+ ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_2, addr_2));
+}
+
+TEST(is_addr_in_subnet, invalid_subnets)
+{
+ std::string public_network_1 = "10.1.1.0/33";
+ std::string public_network_2 = "25.0.0.99/10";
+ entity_addr_t addr;
+ addr.parse("10.1.1.2", nullptr);
+
+ boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false);
+ cct->_conf._clear_safe_to_start_threads();
+
+ ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_1, addr)); // Invalid prefix
+ ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_2, addr)); // Invalid subnet string
+}
+
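
The tests above pin down the CIDR semantics of is_addr_in_subnet: boundary addresses match, malformed addresses and invalid prefixes never match, and family mismatches are rejected. A self-contained sketch of the IPv4 rule being exercised, stated as an assumption about the core check (the real helper additionally honours ms_bind_ipv4/ms_bind_ipv6 and handles IPv6 prefixes):

  #include <arpa/inet.h>
  #include <cstdint>
  #include <string>

  // An address is in "net/len" iff its top `len` bits equal the subnet's.
  static bool ipv4_in_subnet(const std::string& addr,
                             const std::string& net, int len) {
    in_addr a{}, n{};
    if (inet_pton(AF_INET, addr.c_str(), &a) != 1 ||
        inet_pton(AF_INET, net.c_str(), &n) != 1 ||
        len < 0 || len > 32) {
      return false;  // malformed input never matches, as in the tests
    }
    uint32_t mask = (len == 0) ? 0 : htonl(~uint32_t(0) << (32 - len));
    return (a.s_addr & mask) == (n.s_addr & mask);
  }
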
diff --git a/src/vstart.sh b/src/vstart.sh
index b445e250a00..a992f33c856 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -159,6 +159,7 @@ smallmds=0
short=0
crimson=0
ec=0
+cephexporter=0
cephadm=0
parallel=true
restart=1
@@ -233,6 +234,7 @@ options:
-G disable Kerberos/GSSApi authentication
--hitset <pool> <hit_set_type>: enable hitset tracking
-e : create an erasure pool
+ --cephexporter: start the ceph-exporter daemon
-o config add extra config parameters to all sections
--rgw_port specify ceph rgw http listen port
--rgw_frontend specify the rgw frontend configuration
@@ -372,6 +374,9 @@ case $1 in
-e)
ec=1
;;
+ --cephexporter)
+ cephexporter=1
+ ;;
--new | -n)
new=1
;;
@@ -1140,6 +1145,17 @@ EOF
fi
}
+start_cephexporter() {
+ debug echo "Starting Ceph exporter daemon..."
+
+  # Start the exporter daemon, pointing it at the admin socket directory
+ prunb ceph-exporter \
+ -c "$conf_fn" \
+ --sock-dir "$CEPH_ASOK_DIR" \
+ --addrs "$IP"
+}
+
start_osd() {
if [ $inc_osd_num -gt 0 ]; then
old_maxosd=$($CEPH_BIN/ceph osd getmaxosd | sed -e 's/max_osd = //' -e 's/ in epoch.*//')
@@ -1738,6 +1754,10 @@ if [ $CEPH_NUM_MDS -gt 0 ]; then
ceph_adm fs authorize \* "client.fs" / rwp >> "$keyring_fn"
fi
+if [ "$cephexporter" -eq 1 ]; then
+ start_cephexporter
+fi
+
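
With the new flag, a developer cluster that also exposes per-daemon metrics can be brought up in one step, for example (standard vstart environment variables; flag name from this patch):

  MON=1 OSD=3 MGR=1 ../src/vstart.sh -n -x --cephexporter

prunb backgrounds ceph-exporter with the cluster's conf file, the admin-socket directory, and the chosen IP, so a local Prometheus can scrape it without any extra deployment.
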
# Don't set max_mds until all the daemons are started, otherwise
# the intended standbys might end up in active roles.
if [ "$CEPH_MAX_MDS" -gt 1 ]; then