diff options
428 files changed, 11886 insertions, 4475 deletions
diff --git a/.githubmap b/.githubmap index 5265fa59bed..c8ae6e284a2 100644 --- a/.githubmap +++ b/.githubmap @@ -9,6 +9,7 @@ # a2batic Kanika Murarka <kmurarka@redhat.com> aaSharma14 Aashish Sharma <aasharma@redhat.com> +abhishek-kane Abhishek Kane <abhishek.kane@ibm.com> <abhishek.kane@gmail.com> aclamk Adam Kupczyk <akupczyk@redhat.com> adamemerson Adam C. Emerson <aemerson@redhat.com> adk3798 Adam King <adking@redhat.com> @@ -13,6 +13,7 @@ Aashish Sharma <aasharma@redhat.com> <66050535+aaSharma14@users.noreply.github.c Aashish Sharma <aasharma@redhat.com> <aasharma@li-e74156cc-2f67-11b2-a85c-e98659a63c5c.ibm.com> Aashish Sharma <aasharma@redhat.com> <aashishsharma@fedora.redhat.com> Aashish Sharma <aasharma@redhat.com> <aashishsharma@localhost.localdomain> +Abhishek Kane <abhishek.kane@ibm.com> <abhishek.kane@gmail.com> Abhishek Lekshmanan <abhishek.lekshmanan@cern.ch> <abhishek.l@cern.ch> Abhishek Lekshmanan <abhishek@suse.com> <abhishek.lekshmanan@gmail.com> Abhishek Lekshmanan <abhishek@suse.com> <alekshmanan@suse.com> diff --git a/.organizationmap b/.organizationmap index e59e6ae24e1..ac9b0ea70fe 100644 --- a/.organizationmap +++ b/.organizationmap @@ -345,6 +345,7 @@ Huawei <contact@huawei.com> Yehu <yehu5@huawei.com> Huayun <contact@huayun.com> Zheng Yin <zhengyin@huayun.com> Huazhong University of Science and Technology <contact@hust.edu.cn> Luo Runbing <runsisi@hust.edu.cn> HXT Semiconductor <contact@hxt-semitech.org> Jiang Yutang <yutang2.jiang@hxt-semitech.com> +IBM <contact@IBM.com> Abhishek Kane <abhishek.kane@ibm.com> IBM <contact@IBM.com> Adam Kupczyk <akupczyk@ibm.com> IBM <contact@IBM.com> Afreen Misbah <afreen@ibm.com> IBM <contact@IBM.com> Aliaksei Makarau <aliaksei.makarau@ibm.com> diff --git a/.peoplemap b/.peoplemap index 418e8505fb4..ed70830c092 100644 --- a/.peoplemap +++ b/.peoplemap @@ -16,6 +16,7 @@ # # git log --pretty='%aN <%aE>' $range | git -c mailmap.file=.peoplemap check-mailmap --stdin | sort | uniq | sed -e 's/\(.*\) \(<.*\)/\2 
\1/' | uniq --skip-field=1 --all-repeated | sed -e 's/\(.*>\) \(.*\)/\2 \1/' # +Abhishek Kane <abhishek.kane@ibm.com> <abhishek.kane@gmail.com> Abhishek Lekshmanan <abhishek.lekshmanan@cern.ch> <abhishek@suse.com> Adam Kupczyk <akupczyk@ibm.com> <akupczyk@redhat.com> <akupczyk@mirantis.com> Alexandre Marangone <amarango@redhat.com> Alexandre Marangone <alexandre.marangone@inktank.com> diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 1b0a75e01a1..d25acfa9c6d 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -1,5 +1,16 @@ >=20.0.0 +* RGW: The User Account feature introduced in Squid provides first-class support for + IAM APIs and policy. Our preliminary STS support was instead based on tenants, and + exposed some IAM APIs to admins only. This tenant-level IAM functionality is now + deprecated in favor of accounts. While we'll continue to support the tenant feature + itself for namespace isolation, the following features will be removed no sooner + than the V release: + * tenant-level IAM APIs like CreateRole, PutRolePolicy and PutUserPolicy, + * use of tenant names instead of accounts in IAM policy documents, + * interpretation of IAM policy without cross-account policy evaluation, + * S3 API support for cross-tenant names such as `Bucket='tenant:bucketname'` + * RBD: All Python APIs that produce timestamps now return "aware" `datetime` objects instead of "naive" ones (i.e. those including time zone information instead of those not including it). All timestamps remain to be in UTC but @@ -34,6 +45,8 @@ (--yes-i-really-mean-it). This has been added as a precaution to tell the users that modifying "max_mds" may not help with troubleshooting or recovery effort. Instead, it might further destabilize the cluster. +* RADOS: Added convenience function `librados::AioCompletion::cancel()` with + the same behavior as `librados::IoCtx::aio_cancel()`. * mgr/restful, mgr/zabbix: both modules, already deprecated since 2020, have been finally removed. 
They have not been actively maintenance in the last years, @@ -47,6 +60,10 @@ fuse client for `fallocate` for the default case (i.e. mode == 0) since CephFS does not support disk space reservation. The only flags supported are `FALLOC_FL_KEEP_SIZE` and `FALLOC_FL_PUNCH_HOLE`. +* pybind/rados: Fixes WriteOp.zero() in the original reversed order of arguments + `offset` and `length`. When pybind calls WriteOp.zero(), the argument passed + does not match rados_write_op_zero, and offset and length are swapped, which + results in an unexpected response. * The HeadBucket API now reports the `X-RGW-Bytes-Used` and `X-RGW-Object-Count` headers only when the `read-stats` querystring is explicitly included in the @@ -61,6 +78,9 @@ `ceph fs subvolume earmark set`, `ceph fs subvolume earmark get` and `ceph fs subvolume earmark rm` have been added to set, get and remove earmark from a given subvolume. +* RADOS: A performance bottleneck in the balancer mgr module has been fixed. + Related Tracker: https://tracker.ceph.com/issues/68657 + >=19.0.0 * cephx: key rotation is now possible using `ceph auth rotate`. Previously, @@ -328,6 +348,8 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config * NFS: The export create/apply of CephFS based exports will now have a additional parameter `cmount_path` under the FSAL block, which specifies the path within the CephFS to mount this export on. If this and the other `EXPORT { FSAL {} }` options are the same between multiple exports, those exports will share a single CephFS client. If not specified, the default is `/`. +* CephFS: MDS emits a warning with estimated replay completion time when replay + runs for more than 30 seconds.
>=18.0.0 diff --git a/README.md b/README.md index 56257697e9a..f8fcf35e8b7 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ To build Ceph, follow this procedure: ninja -j3 - > [IMPORTANT] + > [!IMPORTANT] > > [Ninja](https://ninja-build.org/) is the build system used by the Ceph > project to build test builds. The number of jobs used by `ninja` is @@ -126,6 +126,9 @@ To build Ceph, follow this procedure: 5. Install the vstart cluster: ninja install + + + ### CMake Options @@ -177,6 +180,36 @@ The diagnostic colors will be visible when the following command is run: Other available values for `DIAGNOSTICS_COLOR` are `auto` (default) and `never`. +## Tips and Tricks + + * Use "debug builds" only when needed. Debugging builds are helpful for + development, but they can slow down performance. Use + `-DCMAKE_BUILD_TYPE=Release` when debugging isn't necessary. + * Enable Selective Daemons when testing specific components. Don't start + unnecessary daemons. + * Preserve Existing Data skip cluster reinitialization between tests by + using the `-n` flag. + * To manage a vstart cluster, stop daemons using `./stop.sh` and start them + with `./vstart.sh --daemon osd.${ID} [--nodaemonize]`. + * Restart the sockets by stopping and restarting the daemons associated with + them. This ensures that there are no stale sockets in the cluster. + * To track RocksDB performance, set `export ROCKSDB_PERF=true` and start + the cluster by using the command `./vstart.sh -n -d -x --bluestore`. + * Build with `vstart-base` using debug flags in cmake, compile, and deploy + via `./vstart.sh -d -n --bluestore`. + * To containerize, generate configurations with `vstart.sh`, and deploy with + Docker, mapping directories and configuring the network. + * Manage containers using `docker run`, `stop`, and `rm`. For detailed + setups, consult the Ceph-Container repository. + +## Troubleshooting + + * Cluster Fails to Start: Look for errors in the logs under the `out/` + directory. 
+ * OSD Crashes: Check the OSD logs for errors. + * Cluster in a `Health Error` State: Run the `ceph status` command to + identify the issue. + * RocksDB Errors: Look for RocksDB-related errors in the OSD logs. ## Building a source tarball diff --git a/container/Containerfile b/container/Containerfile index c954ebed1be..9a5a88e76a1 100644 --- a/container/Containerfile +++ b/container/Containerfile @@ -212,6 +212,7 @@ RUN rpm -q $(cat packages.txt) && rm -f /var/lib/rpm/__db* && rm -f *packages.tx # Set some envs in the container for quickly inspecting details about the build at runtime ENV CEPH_IS_DEVEL="${CI_CONTAINER}" \ CEPH_REF="${CEPH_REF}" \ + CEPH_VERSION="${CEPH_REF}" \ CEPH_OSD_FLAVOR="${OSD_FLAVOR}" \ FROM_IMAGE="${FROM_IMAGE}" diff --git a/debian/control b/debian/control index ec04c2599cd..a8c79f7a731 100644 --- a/debian/control +++ b/debian/control @@ -996,10 +996,11 @@ Package: librgw2 Architecture: linux-any Section: libs Depends: librados2 (= ${binary:Version}), + liblua5.3-0, ${misc:Depends}, ${shlibs:Depends}, - liblua5.3-dev, - luarocks, +Suggests: liblua5.3-dev, + luarocks, Description: RADOS Gateway client library RADOS is a distributed object store used by the Ceph distributed storage system. This package provides a REST gateway to the diff --git a/doc/_ext/ceph_commands.py b/doc/_ext/ceph_commands.py index 0697c71f0e1..d96eab08853 100644 --- a/doc/_ext/ceph_commands.py +++ b/doc/_ext/ceph_commands.py @@ -94,7 +94,7 @@ class CmdParam(object): self.goodchars = goodchars self.positional = positional != 'false' - assert who == None + assert who is None def help(self): advanced = [] diff --git a/doc/cephadm/services/osd.rst b/doc/cephadm/services/osd.rst index 831bd238c79..90ebd86f897 100644 --- a/doc/cephadm/services/osd.rst +++ b/doc/cephadm/services/osd.rst @@ -198,6 +198,18 @@ There are a few ways to create new OSDs: .. 
warning:: When deploying new OSDs with ``cephadm``, ensure that the ``ceph-osd`` package is not already installed on the target host. If it is installed, conflicts may arise in the management and control of the OSD that may lead to errors or unexpected behavior. +* OSDs created via ``ceph orch daemon add`` are by default not added to the orchestrator's OSD service; instead, they are added to the 'osd' service. To attach an OSD to a different, existing OSD service, issue a command of the following form: + + .. prompt:: bash # + + ceph orch osd set-spec-affinity <service_name> <osd_id(s)> + + For example: + + .. prompt:: bash # + + ceph orch osd set-spec-affinity osd.default_drive_group 0 1 + Dry Run ------- diff --git a/doc/cephadm/services/rgw.rst b/doc/cephadm/services/rgw.rst index ed0b149365a..3df8ed2fc56 100644 --- a/doc/cephadm/services/rgw.rst +++ b/doc/cephadm/services/rgw.rst @@ -173,6 +173,32 @@ Then apply this yaml document: Note the value of ``rgw_frontend_ssl_certificate`` is a literal string as indicated by a ``|`` character preserving newline characters. +Disabling multisite sync traffic +-------------------------------- + +There is an RGW config option called ``rgw_run_sync_thread`` that, when set to +false, tells the RGW daemon to not transmit multisite replication data. This is useful if you want +that RGW daemon to be dedicated to I/O rather than multisite sync operations. +The RGW spec file includes a setting ``disable_multisite_sync_traffic`` that when +set to "True" will tell cephadm to set ``rgw_run_sync_thread`` to false for all +RGW daemons deployed for that RGW service. For example: + +.. code-block:: yaml + + service_type: rgw + service_id: foo + placement: + label: rgw + spec: + rgw_realm: myrealm + rgw_zone: myzone + rgw_zonegroup: myzg + disable_multisite_sync_traffic: True + +.. note:: This will only stop the RGW daemon(s) from sending replication data.
+ The daemon can still receive replication data unless it has been removed + from the zonegroup and zone replication endpoints. + Service specification --------------------- diff --git a/doc/cephfs/disaster-recovery-experts.rst b/doc/cephfs/disaster-recovery-experts.rst index 7677b42f47e..b01a3dfde6a 100644 --- a/doc/cephfs/disaster-recovery-experts.rst +++ b/doc/cephfs/disaster-recovery-experts.rst @@ -21,43 +21,46 @@ Advanced: Metadata repair tools Journal export -------------- -Before attempting dangerous operations, make a copy of the journal like so: +Before attempting any dangerous operation, make a copy of the journal by +running the following command: -:: +.. prompt:: bash # - cephfs-journal-tool journal export backup.bin + cephfs-journal-tool journal export backup.bin -Note that this command may not always work if the journal is badly corrupted, -in which case a RADOS-level copy should be made (http://tracker.ceph.com/issues/9902). +If the journal is badly corrupted, this command might not work. If the journal +is badly corrupted, make a RADOS-level copy +(http://tracker.ceph.com/issues/9902). Dentry recovery from journal ---------------------------- If a journal is damaged or for any reason an MDS is incapable of replaying it, -attempt to recover what file metadata we can like so: +attempt to recover file metadata by running the following command: -:: +.. prompt:: bash # - cephfs-journal-tool event recover_dentries summary + cephfs-journal-tool event recover_dentries summary -This command by default acts on MDS rank 0, pass --rank=<n> to operate on other ranks. +By default, this command acts on MDS rank ``0``. Pass the option ``--rank=<n>`` +to the ``cephfs-journal-tool`` command to operate on other ranks. -This command will write any inodes/dentries recoverable from the journal -into the backing store, if these inodes/dentries are higher-versioned -than the previous contents of the backing store. 
If any regions of the journal -are missing/damaged, they will be skipped. +This command writes all inodes and dentries recoverable from the journal into +the backing store, but only if these inodes and dentries are higher-versioned +than the existing contents of the backing store. Any regions of the journal +that are missing or damaged will be skipped. -Note that in addition to writing out dentries and inodes, this command will update -the InoTables of each 'in' MDS rank, to indicate that any written inodes' numbers -are now in use. In simple cases, this will result in an entirely valid backing +In addition to writing out dentries and inodes, this command updates the +InoTables of each ``in`` MDS rank, to indicate that any written inodes' numbers +are now in use. In simple cases, this will result in an entirely valid backing store state. .. warning:: - The resulting state of the backing store is not guaranteed to be self-consistent, - and an online MDS scrub will be required afterwards. The journal contents - will not be modified by this command, you should truncate the journal + The resulting state of the backing store is not guaranteed to be + self-consistent, and an online MDS scrub will be required afterwards. The + journal contents will not be modified by this command. Truncate the journal separately after recovering what you can. Journal truncation diff --git a/doc/cephfs/health-messages.rst b/doc/cephfs/health-messages.rst index 0f171c6ccc9..7aa1f2e44ee 100644 --- a/doc/cephfs/health-messages.rst +++ b/doc/cephfs/health-messages.rst @@ -269,3 +269,11 @@ other daemons, please see :ref:`health-checks`. To evict and permanently block broken clients from connecting to the cluster, set the ``required_client_feature`` bit ``client_mds_auth_caps``. + +``MDS_ESTIMATED_REPLAY_TIME`` +----------------------------- + Message + "HEALTH_WARN Replay: x% complete. 
Estimated time remaining *x* seconds" + + Description + When an MDS journal replay takes more than 30 seconds, this message indicates the estimated time to completion. diff --git a/doc/cephfs/index.rst b/doc/cephfs/index.rst index 57ea336c00b..630d29f1956 100644 --- a/doc/cephfs/index.rst +++ b/doc/cephfs/index.rst @@ -93,6 +93,7 @@ Administration CephFS Top Utility <cephfs-top> Scheduled Snapshots <snap-schedule> CephFS Snapshot Mirroring <cephfs-mirroring> + Purge Queue <purge-queue> .. raw:: html @@ -147,6 +148,7 @@ CephFS Concepts LazyIO <lazyio> Directory fragmentation <dirfrags> Multiple active MDS daemons <multimds> + Snapshots <snapshots> .. raw:: html diff --git a/doc/cephfs/purge-queue.rst b/doc/cephfs/purge-queue.rst new file mode 100644 index 00000000000..d7a68e7fa55 --- /dev/null +++ b/doc/cephfs/purge-queue.rst @@ -0,0 +1,106 @@ +============ +Purge Queue +============ + +MDS maintains a data structure known as **Purge Queue** which is responsible +for managing and executing the parallel deletion of files. +There is a purge queue for every MDS rank. Purge queues consist of purge items +which contain nominal information from the inodes such as size and the layout +(i.e. all other un-needed metadata information is discarded making it +independent of all metadata structures). + +Deletion process +================ + +When a client requests deletion of a directory (say ``rm -rf``): + +- MDS queues the files and subdirectories (purge items) from pq (purge queue) + journal in the purge queue. +- Processes the deletion of inodes in background in small and manageable + chunks. +- MDS instructs underlying OSDs to clean up the associated objects in data + pool. +- Updates the journal. + +.. note:: If the users delete the files more quickly than the + purge queue can process then the data pool usage might increase + substantially over time.
In extreme scenarios, the purge queue + backlog can become so huge that it can slacken the capacity reclaim + and the linux ``du`` command for CephFS might report inconsistent + data compared to the CephFS Data pool. + +There are a few tunable configs that MDS uses internally to throttle purge +queue processing: + +.. confval:: filer_max_purge_ops +.. confval:: mds_max_purge_files +.. confval:: mds_max_purge_ops +.. confval:: mds_max_purge_ops_per_pg + +Generally, the defaults are adequate for most clusters. However, in +case of pretty huge clusters, if the need arises like ``pq_item_in_journal`` +(counter of things pending deletion) reaching gigantic figure then the configs +can be tuned to 4-5 times of the default value as a starting point and +further increments are subject to more requirements. + +Start from the most trivial config ``filer_max_purge_ops``, which should help +reclaim the space more quickly:: + + $ ceph config set mds filer_max_purge_ops 40 + +Incrementing ``filer_max_purge_ops`` should just work for most +clusters but if it doesn't then move ahead with tuning other configs:: + + $ ceph config set mds mds_max_purge_files 256 + $ ceph config set mds mds_max_purge_ops 32768 + $ ceph config set mds mds_max_purge_ops_per_pg 2 + +.. note:: Setting these values won't immediately break anything except + inasmuch as they control how many delete ops we issue to the + underlying RADOS cluster, but might eat up some cluster performance + if the values set are staggeringly high. + +.. note:: The purge queue is not an auto-tuning system in terms of its work + limits as compared to what is going on. So it is advised to make + a conscious decision while tuning the configs based on the cluster + size and workload. 
+ +Examining purge queue perf counters +=================================== + +When analysing MDS perf dumps, the purge queue statistics look like:: + + "purge_queue": { + "pq_executing_ops": 56655, + "pq_executing_ops_high_water": 65350, + "pq_executing": 1, + "pq_executing_high_water": 3, + "pq_executed": 25, + "pq_item_in_journal": 6567004 + } + +Let us understand what each of these means: + +.. list-table:: + :widths: 50 50 + :header-rows: 1 + + * - Name + - Description + * - pq_executing_ops + - Purge queue operations in flight + * - pq_executing_ops_high_water + - Maximum number of executing purge operations recorded + * - pq_executing + - Purge queue files being deleted + * - pq_executing_high_water + - Maximum number of executing file purges + * - pq_executed + - Purge queue files deleted + * - pq_item_in_journal + - Purge items (files) left in journal + +.. note:: ``pq_executing`` and ``pq_executing_ops`` might look similar but + there is a small nuance. ``pq_executing`` tracks number of files + in the purge queue while ``pq_executing_ops`` is the count of RADOS + objects from all the files in purge queue. diff --git a/doc/cephfs/snap-schedule.rst b/doc/cephfs/snap-schedule.rst index a94d938040f..48e79047864 100644 --- a/doc/cephfs/snap-schedule.rst +++ b/doc/cephfs/snap-schedule.rst @@ -197,6 +197,15 @@ this happens, the next snapshot will be schedule as if the previous one was not delayed, i.e. one or more delayed snapshots will not cause drift in the overall schedule. +If a volume is deleted while snapshot schedules are active on the volume, then +there might be cases when Python Tracebacks are seen in the log file or on the +command-line when commands are executed on such volumes. Although measures have +been taken to take note of the fs_map changes and delete active timers and +close database connections to avoid Python Tracebacks, it is not possible to +completely mute the tracebacks due to the inherent nature of problem. 
In the +event that such tracebacks are seen, the only solution to get the system to a +stable state is to disable and re-enable the snap_schedule Manager Module. + In order to somewhat limit the overall number of snapshots in a file system, the module will only keep a maximum of 50 snapshots per directory. If the retention policy results in more then 50 retained snapshots, the retention list will be diff --git a/doc/cephfs/snapshots.rst b/doc/cephfs/snapshots.rst new file mode 100644 index 00000000000..a60be96ed53 --- /dev/null +++ b/doc/cephfs/snapshots.rst @@ -0,0 +1,85 @@ +================ +CephFS Snapshots +================ + +CephFS snapshots create an immutable view of the file system at the point +in time they are taken. CephFS supports snapshots, which are managed in a +special hidden subdirectory named ``.snap``. Snapshots are created using +``mkdir`` inside this directory. + +Snapshots can be exposed with a different name by changing the following client configurations. + +- ``snapdirname`` which is a mount option for kernel clients +- ``client_snapdir`` which is a mount option for ceph-fuse. + +Snapshot Creation +================== + +The CephFS snapshot feature is enabled by default on new file systems. To enable +it on existing file systems, use the command below. + +.. code-block:: bash + + $ ceph fs set <fs_name> allow_new_snaps true + +When snapshots are enabled, all directories in CephFS will have a special ``.snap`` +directory. (You may configure a different name with the client snapdir setting if +you wish.) +To create a CephFS snapshot, create a subdirectory under ``.snap`` with a name of +your choice. +For example, to create a snapshot on directory ``/file1/``, invoke ``mkdir /file1/.snap/snapshot-name`` + +.. code-block:: bash + + $ touch file1 + $ cd .snap + $ mkdir my_snapshot + +Using snapshot to recover data +=============================== + +Snapshots can also be used to recover some deleted files.
+ +- ``create a file1 and create snapshot snap1`` + +.. code-block:: bash + + $ touch /mnt/cephfs/file1 + $ cd .snap + $ mkdir snap1 + +- ``create a file2 and create snapshot snap2`` + +.. code-block:: bash + + $ touch /mnt/cephfs/file2 + $ cd .snap + $ mkdir snap2 + +- ``delete file1 and create a new snapshot snap3`` + +.. code-block:: bash + + $ rm /mnt/cephfs/file1 + $ cd .snap + $ mkdir snap3 + +- ``recover file1 using snapshot snap2 using cp command`` + +.. code-block:: bash + + $ cd .snap + $ cd snap2 + $ cp file1 /mnt/cephfs/ + +Snapshot Deletion +================== + +Snapshots are deleted by invoking ``rmdir`` on the ``.snap`` directory they are +rooted in. (Attempts to delete a directory which roots the snapshots will fail; +you must delete the snapshots first.) + +.. code-block:: bash + + $ cd .snap + $ rmdir my_snapshot diff --git a/doc/conf.py b/doc/conf.py index 4fdc9a53b75..5293ff1b212 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -76,7 +76,7 @@ html_show_sphinx = False html_static_path = ["_static"] html_sidebars = { '**': ['smarttoc.html', 'searchbox.html'] - } +} html_css_files = ['css/custom.css'] @@ -133,13 +133,23 @@ extensions = [ 'sphinxcontrib.mermaid', 'sphinxcontrib.openapi', 'sphinxcontrib.seqdiag', - ] +] ditaa = shutil.which("ditaa") if ditaa is not None: # in case we don't have binfmt_misc enabled or jar is not registered - ditaa_args = ['-jar', ditaa] - ditaa = 'java' + _jar_paths = [ + '/usr/share/ditaa/lib/ditaa.jar', # Gentoo + '/usr/share/ditaa/ditaa.jar', # deb + '/usr/share/java/ditaa.jar', # rpm + ] + _jar_paths = [p for p in _jar_paths if os.path.exists(p)] + if _jar_paths: + ditaa = 'java' + ditaa_args = ['-jar', _jar_paths[0]] + else: + # keep ditaa from shutil.which + ditaa_args = [] extensions += ['sphinxcontrib.ditaa'] else: extensions += ['plantweb.directive'] diff --git a/doc/dev/developer_guide/testing_integration_tests/tests-integration-testing-teuthology-workflow.rst 
b/doc/dev/developer_guide/testing_integration_tests/tests-integration-testing-teuthology-workflow.rst index 34dfd521eaa..6964012ef31 100644 --- a/doc/dev/developer_guide/testing_integration_tests/tests-integration-testing-teuthology-workflow.rst +++ b/doc/dev/developer_guide/testing_integration_tests/tests-integration-testing-teuthology-workflow.rst @@ -6,7 +6,8 @@ Integration Tests using Teuthology Workflow Infrastructure -------------- -Components: +Components +********** 1. `ceph-ci`_: Clone of the main Ceph repository, used for triggering Jenkins Ceph builds for development. @@ -44,7 +45,27 @@ Components: Each Teuthology test *run* contains multiple test *jobs*. Each job runs in an environment isolated from other jobs, on a different collection of test nodes. -To test a change in Ceph, follow these steps: +Workflow Overview +***************** + +.. image:: workflow.png + + +To test a change in Ceph, start by pushing a branch with your changes to the +`ceph-ci`_ repository. This will automatically trigger the Jenkins process +to build Ceph binaries - the status of the build can be observed on `Shaman`_. +These built packages will be uploaded on `Chacra`_. + +To schedule a Teuthology integration test against this new build, you will +need access to the Sepia lab. Once you have access, log into the Teuthology +machine and complete the one-time initial Teuthology setup required to run +Teuthology commands. After the setup, use the ``teuthology-suite`` command to schedule +a Teuthology run. In this command, use the ``-c <ceph-ci branch name>`` option to +specify your build. The results of your test can be observed on `Pulpito`_. +Log into a `developer playground machine`_ to review the Teuthology run's archive logs. + + +The rest of the document will explain these steps in detail: 1. Getting binaries - Build Ceph. 2. Scheduling Test Run: @@ -98,6 +119,31 @@ Ceph binaries must be built for your branch before you can use teuthology to run .. 
_the Chacra site: https://shaman.ceph.com/api/search/?status=ready&project=ceph +Pushing to the ceph-ci repository +********************************* + +Follow these steps to push to the ceph-ci repository. After pushing, a new build will +automatically be scheduled. + +1. Add the ceph-ci repository as a remote to your local clone of the Ceph repository: + + .. prompt:: bash $ + + git remote add ceph-ci git@github.com:ceph/ceph-ci.git + + $ git remote -v + origin git@github.com:ceph/ceph.git (fetch) + origin git@github.com:ceph/ceph.git (push) + ceph-ci git@github.com:ceph/ceph-ci.git (fetch) + ceph-ci git@github.com:ceph/ceph-ci.git (push) + +2. Push your branch upstream by running a command of the following form: + + .. prompt:: bash $ + + $ git push ceph-ci wip-yourname-feature-x + + Naming the ceph-ci branch ************************* Prepend your branch with your name before you push it to ceph-ci. For example, @@ -110,15 +156,14 @@ the name of that stable branch in your ceph-ci branch name. For example, the ``feature-x`` PR branch should be named ``wip-feature-x-nautilus``. *This is not just a convention. This ensures that your branch is built in the correct environment.* -You can choose to only trigger a CentOS 9.Stream build (excluding other distro like ubuntu) -by adding "centos9-only" at the end of the ceph-ci branch name. For example, -``wip-$yourname-feature-centos9-only``. This helps to get quicker builds and save resources -when you don't require binaries for other distros. - Delete the branch from ceph-ci when you no longer need it. If you are logged in to GitHub, all your branches on ceph-ci can be found here: https://github.com/ceph/ceph-ci/branches. +.. note:: You can choose to only trigger a CentOS 9.Stream build (excluding other + distro like ubuntu) by adding "centos9-only" at the end of the ceph-ci branch name. + For example, ``wip-$yourname-feature-centos9-only``. 
This helps to get quicker builds + and save resources when you don't require binaries for other distros. Scheduling Test Run ------------------- diff --git a/doc/dev/developer_guide/testing_integration_tests/workflow.png b/doc/dev/developer_guide/testing_integration_tests/workflow.png Binary files differnew file mode 100644 index 00000000000..610baf683bc --- /dev/null +++ b/doc/dev/developer_guide/testing_integration_tests/workflow.png diff --git a/doc/man/8/cephadm.rst b/doc/man/8/cephadm.rst index b2cad6cb505..3c23a9867f7 100644 --- a/doc/man/8/cephadm.rst +++ b/doc/man/8/cephadm.rst @@ -13,7 +13,7 @@ Synopsis | [--log-dir LOG_DIR] [--logrotate-dir LOGROTATE_DIR] | [--unit-dir UNIT_DIR] [--verbose] [--timeout TIMEOUT] | [--retry RETRY] [--no-container-init] -| {version,pull,inspect-image,ls,list-networks,adopt,rm-daemon,rm-cluster,run,shell,enter,ceph-volume,unit,logs,bootstrap,deploy,check-host,prepare-host,add-repo,rm-repo,install,list-images} +| {version,pull,inspect-image,ls,list-networks,adopt,rm-daemon,rm-cluster,run,shell,enter,ceph-volume,unit,logs,bootstrap,deploy,check-host,prepare-host,add-repo,rm-repo,install,list-images,update-osd-service} | ... @@ -106,6 +106,7 @@ Synopsis | **cephadm** **list-images** +| **cephadm** **update-osd-service** [-h] [--fsid FSID] --osd-ids OSD_IDS --service-name SERVICE_NAME Description @@ -535,6 +536,18 @@ list-images List the default container images for all services in ini format. The output can be modified with custom images and passed to --config flag during bootstrap. 
+update-osd-service +------------------ + +Update the OSD service for specific OSDs + +Arguments: + +* [--fsid FSID] cluster FSID +* --osd-ids OSD_IDS Comma-separated OSD IDs +* --service-name SERVICE_NAME OSD service name + + Availability ============ diff --git a/doc/man/8/radosgw-admin.rst b/doc/man/8/radosgw-admin.rst index c7750c348ad..3cd4338a5ec 100644 --- a/doc/man/8/radosgw-admin.rst +++ b/doc/man/8/radosgw-admin.rst @@ -541,6 +541,13 @@ Options Generate random secret key. +.. option:: --generate-key + + Create user with or without credentials. + If this option is set to false, then the user cannot set --gen-access-key/--gen-secret/--secret-key/--access-key. + If this option is set to true, then the user cannot set --secret-key/--access-key and bypass options for --gen-secret/--gen-access-key. + Default is true. + .. option:: --key-type=<type> Key type, options are: swift, s3. diff --git a/doc/mgr/dashboard.rst b/doc/mgr/dashboard.rst index b0448bd0eef..32824fab4b5 100644 --- a/doc/mgr/dashboard.rst +++ b/doc/mgr/dashboard.rst @@ -1310,9 +1310,9 @@ redirection on standby nodes. mode tcp option httpchk GET / http-check expect status 200 - server x <HOST>:<PORT> ssl check verify none - server y <HOST>:<PORT> ssl check verify none - server z <HOST>:<PORT> ssl check verify none + server x <HOST>:<PORT> check check-ssl verify none + server y <HOST>:<PORT> check check-ssl verify none + server z <HOST>:<PORT> check check-ssl verify none .. _dashboard-auditing: diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index f5d38948150..a1498a09fd0 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -1665,6 +1665,14 @@ Some of the gateways are in the GW_UNAVAILABLE state. If a NVMeoF daemon has crashed, the daemon log file (found at ``/var/log/ceph/``) may contain troubleshooting information. +NVMEOF_GATEWAY_DELETING +_______________________ + +Some of the gateways are in the GW_DELETING state.
They will stay in this +state until all the namespaces under the gateway's load balancing group are +moved to another load balancing group ID. This is done automatically by the +load balancing process. If this alert persists for a long time, there might +be an issue with that process. Miscellaneous ------------- diff --git a/doc/rados/operations/stretch-mode.rst b/doc/rados/operations/stretch-mode.rst index e8be5e13e6a..7a4fa46117d 100644 --- a/doc/rados/operations/stretch-mode.rst +++ b/doc/rados/operations/stretch-mode.rst @@ -119,13 +119,29 @@ See https://tracker.ceph.com/issues/68338 for more information. Stretch Mode ============ -Stretch mode is designed to handle deployments in which you cannot guarantee the -replication of data across two data centers. This kind of situation can arise -when the cluster's CRUSH rule specifies that three copies are to be made, but -then a copy is placed in each data center with a ``min_size`` of 2. Under such -conditions, a placement group can become active with two copies in the first -data center and no copies in the second data center. +Stretch mode is designed to handle netsplit scenarios between two data zones as well +as the loss of one data zone. It handles the netsplit scenario by choosing the surviving zone +that has the better connection to the ``tiebreaker monitor``. It handles the loss of one zone by +reducing the ``size`` to ``2`` and ``min_size`` to ``1``, allowing the cluster to continue operating +with the remaining zone. When the lost zone comes back, the cluster will recover the lost data +and return to normal operation. + +Connectivity Monitor Election Strategy +--------------------------------------- +When using stretch mode, the monitor election strategy must be set to ``connectivity``. +This strategy tracks network connectivity between the monitors and is +used to determine which zone should be favored when the cluster is in a netsplit scenario.
+ +See `Changing Monitor Elections`_ + +Stretch Peering Rule +-------------------- +One critical behavior of stretch mode is its ability to prevent a PG from going active if the acting set +contains only replicas from a single zone. This safeguard is crucial for mitigating the risk of data +loss during site failures because if a PG were allowed to go active with replicas only in a single site, +writes could be acknowledged despite a lack of redundancy. In the event of a site failure, all data in the +affected PG would be lost. Entering Stretch Mode --------------------- @@ -271,7 +287,7 @@ possible, if needed). .. _Changing Monitor elections: ../change-mon-elections Exiting Stretch Mode -===================== +-------------------- To exit stretch mode, run the following command: .. prompt:: bash $ diff --git a/doc/radosgw/account.rst b/doc/radosgw/account.rst index 6dab997d93e..0e4ede5a50a 100644 --- a/doc/radosgw/account.rst +++ b/doc/radosgw/account.rst @@ -174,6 +174,11 @@ An existing user can be adopted into an account with ``user modify``:: .. note:: Account membership is permanent. Once added, users cannot be removed from their account. +.. note:: The IAM User API imposes additional requirements on the format + of ``UserName``, which is enforced when migrating users into an account. + If migration fails with "UserName contains invalid characters", the + ``--display-name`` should be modified to match ``[\w+=,.@-]+``. + .. warning:: Ownership of the user's notification topics will not be transferred to the account. Notifications will continue to work, but the topics will no longer be visible to SNS Topic APIs. Topics and diff --git a/doc/radosgw/admin.rst b/doc/radosgw/admin.rst index 7c7d9d6df14..8dbf8c10b04 100644 --- a/doc/radosgw/admin.rst +++ b/doc/radosgw/admin.rst @@ -262,6 +262,7 @@ include: - ``--secret-key=<key>`` manually specifies a S3 secret key or a Swift secret key. - ``--gen-access-key`` automatically generates a random S3 access key. 
- ``--gen-secret`` automatically generates a random S3 secret key or a random Swift secret key. +- ``--generate-key`` create user with or without credentials. If set to false, then user cannot set ``gen-secret/gen-access-key/access-key/secret-key`` Adding S3 keys ~~~~~~~~~~~~~~ diff --git a/doc/radosgw/bucket_logging.rst b/doc/radosgw/bucket_logging.rst index cb9f8465d20..f3e790f5705 100644 --- a/doc/radosgw/bucket_logging.rst +++ b/doc/radosgw/bucket_logging.rst @@ -15,6 +15,12 @@ The log bucket can accumulate logs from multiple buckets. It is recommended to c a different "prefix" for each bucket, so that the logs of different buckets will be stored in different objects in the log bucket. +.. note:: + + - The log bucket must be created before enabling logging on a bucket + - The log bucket cannot be the same as the bucket being logged + - The log bucket cannot have logging enabled on it + .. toctree:: :maxdepth: 1 @@ -29,6 +35,7 @@ Adding a log object to the log bucket is done "lazily", meaning, that if no more remain outside of the log bucket even after the configured time has passed. To counter that, you can flush all logging objects on a given source bucket to log them, regardless if enough time passed or if no more records are written to the object. +Flushing will happen automatically when logging is disabled on a bucket, its logging configuration is changed, or the bucket is deleted. Standard ```````` @@ -72,7 +79,7 @@ has the following format: :: - <prefix><bucket owner>/<source region>/<bucket name>/<year>/<month>/<day>/<year-month-day-hour-minute-second>-<16 bytes unique-id> + <prefix><bucket owner>/<source region>/[tenant:]<bucket name>/<year>/<month>/<day>/<year-month-day-hour-minute-second>-<16 bytes unique-id> For example: @@ -90,7 +97,7 @@ Journal minimum amount of data used for journaling bucket changes (this is a Ceph extension). - bucket owner (or dash if empty) - - bucket name (or dash if empty) + - bucket name (or dash if empty).
in the format: ``[tenant:]<bucket name>`` - time in the following format: ``[day/month/year:hour:minute:second timezone]`` - object key (or dash if empty) - operation in the following format: ``WEBSITE/REST.<HTTP method>.<resource>`` @@ -111,7 +118,7 @@ Standard based on `AWS Logging Record Format`_. - bucket owner (or dash if empty) - - bucket name (or dash if empty) + - bucket name (or dash if empty). in the format: ``[tenant:]<bucket name>`` - time - remote IP (not supported, always a dash) - user or account (or dash if empty) diff --git a/doc/radosgw/config-ref.rst b/doc/radosgw/config-ref.rst index edc6a90b0f9..405bc727208 100644 --- a/doc/radosgw/config-ref.rst +++ b/doc/radosgw/config-ref.rst @@ -75,10 +75,11 @@ aggressiveness of lifecycle processing: .. confval:: rgw_lc_max_wp_worker These values can be tuned based upon your specific workload to further increase the -aggressiveness of lifecycle processing. For a workload with a larger number of buckets (thousands) -you would look at increasing the :confval:`rgw_lc_max_worker` value from the default value of 3 whereas for a -workload with a smaller number of buckets but higher number of objects (hundreds of thousands) -per bucket you would consider decreasing :confval:`rgw_lc_max_wp_worker` from the default value of 3. +aggressiveness of lifecycle processing. For a workload with a large number of buckets (thousands) +you would raise the number of workers by increasing :confval:`rgw_lc_max_worker` +from the default value of 3. Whereas for a workload with a higher number of objects per bucket +(hundreds of thousands) you would raise the number of parallel threads +by increasing :confval:`rgw_lc_max_wp_worker` from the default value of 3. .. note:: When looking to tune either of these specific values please validate the current Cluster performance and Ceph Object Gateway utilization before increasing. 
diff --git a/doc/radosgw/notifications.rst b/doc/radosgw/notifications.rst index 05653956be1..897c280facf 100644 --- a/doc/radosgw/notifications.rst +++ b/doc/radosgw/notifications.rst @@ -188,6 +188,7 @@ updating, use the name of an existing topic and different endpoint values). [&Attributes.entry.15.key=Policy&Attributes.entry.15.value=<policy-JSON-string>] [&Attributes.entry.16.key=user-name&Attributes.entry.16.value=<user-name-string>] [&Attributes.entry.17.key=password&Attributes.entry.17.value=<password-string>] + [&Attributes.entry.18.key=kafka-brokers&Attributes.entry.18.value=<kafka-broker-list>] Request parameters: @@ -296,6 +297,8 @@ Request parameters: - "broker": Messages are considered "delivered" if acked by the broker. (This is the default.) + - kafka-brokers: A comma-separated list of host:port of kafka brokers. These brokers (may contain a broker which is defined in kafka uri) will be added to kafka uri to support sending notifications to a kafka cluster. + .. note:: - The key-value pair of a specific parameter need not reside in the same @@ -571,6 +574,7 @@ Valid AttributeName that can be passed: - mechanism: may be provided together with user/password (default: ``PLAIN``). - kafka-ack-level: No end2end acknowledgement is required. Messages may persist in the broker before being delivered to their final destinations. + - kafka-brokers: Set endpoint with broker(s) as a comma-separated list of host or host:port (default port 9092). Notifications ~~~~~~~~~~~~~ diff --git a/doc/radosgw/s3/objectops.rst b/doc/radosgw/s3/objectops.rst index 2ac52607fe3..ddc5fb910c4 100644 --- a/doc/radosgw/s3/objectops.rst +++ b/doc/radosgw/s3/objectops.rst @@ -115,7 +115,7 @@ Request Headers +---------------------------+------------------------------------------------+--------------------------------+------------+ | **if-match** | Gets only if object ETag matches ETag.
| Entity Tag | No | +---------------------------+------------------------------------------------+--------------------------------+------------+ -| **if-none-match** | Gets only if object ETag matches ETag. | Entity Tag | No | +| **if-none-match** | Gets only if object ETag doesn't match. | Entity Tag | No | +---------------------------+------------------------------------------------+--------------------------------+------------+ Response Headers @@ -155,7 +155,7 @@ Request Headers +---------------------------+------------------------------------------------+--------------------------------+------------+ | **if-match** | Gets only if object ETag matches ETag. | Entity Tag | No | +---------------------------+------------------------------------------------+--------------------------------+------------+ -| **if-none-match** | Gets only if object ETag matches ETag. | Entity Tag | No | +| **if-none-match** | Gets only if object ETag doesn't match | Entity Tag | No | +---------------------------+------------------------------------------------+--------------------------------+------------+ Get Object ACL diff --git a/doc/radosgw/uadk-accel.rst b/doc/radosgw/uadk-accel.rst index 0ed25148d73..aaafe1c21df 100644 --- a/doc/radosgw/uadk-accel.rst +++ b/doc/radosgw/uadk-accel.rst @@ -2,9 +2,9 @@ UADK Acceleration for Compression =============================================== -UADK is a framework for applications to access hardware accelerators in a -unified, secure, and efficient way. UADK is comprised of UACCE, libwd and many -other algorithm libraries. +UADK is a framework that makes it possible for applications to access hardware +accelerators in a unified, secure, and efficient way. UADK is comprised of +UACCE, libwd, and many other algorithm libraries. See `Compressor UADK Support`_. @@ -21,13 +21,13 @@ which enables hardware accelerators that support SVA to adapt to UADK. Currently, HiSilicon Kunpeng hardware accelerators have been registered with UACCE. 
Through the UADK framework, users can run cryptographic and compression -algorithms using hardware accelerators instead of CPUs, freeing up CPU computing -power and improving computing performance. +algorithms using hardware accelerators instead of CPUs, which frees up CPU +computing power and improves computing performance. -A user can access the hardware accelerators by performing user-mode operations on -the character devices, or the use of UADK can be done via frameworks that have -been enabled by others including UADK support (for example, OpenSSL* libcrypto*, -DPDK, and the Linux* Kernel Crypto Framework). +Users can access the hardware accelerators by performing user-mode operations +on the character devices, or the use of UADK can be achieved via frameworks +that have been enabled by others including UADK support (for example, OpenSSL* +libcrypto*, DPDK, and the Linux* Kernel Crypto Framework). See `OpenSSL UADK Engine`_. diff --git a/doc/releases/index.rst b/doc/releases/index.rst index a8015c65465..1393770878f 100644 --- a/doc/releases/index.rst +++ b/doc/releases/index.rst @@ -23,7 +23,6 @@ security fixes. Squid (v19.2.*) <squid> Reef (v18.2.*) <reef> - Quincy (v17.2.*) <quincy> .. ceph_releases:: releases.yml current @@ -40,6 +39,7 @@ receive bug fixes or backports). :maxdepth: 1 :hidden: + Quincy (v17.2.*) <quincy> Pacific (v16.2.*) <pacific> Octopus (v15.2.*) <octopus> Nautilus (v14.2.*) <nautilus> diff --git a/doc/releases/releases.yml b/doc/releases/releases.yml index 948f9eab278..6a76cc7c92c 100644 --- a/doc/releases/releases.yml +++ b/doc/releases/releases.yml @@ -32,6 +32,7 @@ releases: quincy: target_eol: 2024-06-01 + actual_eol: 2025-01-13 releases: - version: 17.2.8 released: 2024-11-25 diff --git a/doc/start/hardware-recommendations.rst b/doc/start/hardware-recommendations.rst index 3c3c781a815..3d5e44d8e02 100644 --- a/doc/start/hardware-recommendations.rst +++ b/doc/start/hardware-recommendations.rst @@ -311,7 +311,7 @@ media cost. 
Moreover, when using NVMe SSDs, you do not need *any* HBA. This additionally reduces the HDD vs SSD cost gap when the system as a whole is considered. The initial cost of a fancy RAID HBA plus onboard cache plus battery backup (BBU or supercapacitor) can easily exceed more than 1000 US -dollars even after discounts - a sum that goes a log way toward SSD cost parity. +dollars even after discounts - a sum that goes a long way toward SSD cost parity. An HBA-free system may also cost hundreds of US dollars less every year if one purchases an annual maintenance contract or extended warranty. diff --git a/examples/rgw/boto3/head_bucket_stats.py b/examples/rgw/boto3/head_bucket_stats.py new file mode 100755 index 00000000000..1de40d63f4a --- /dev/null +++ b/examples/rgw/boto3/head_bucket_stats.py @@ -0,0 +1,27 @@ +#!/usr/bin/python + +import boto3 +import sys + +if len(sys.argv) != 2: + print('Usage: ' + sys.argv[0] + ' <bucket>') + sys.exit(1) + +# bucket name as first argument +bucketname = sys.argv[1] + +# endpoint and keys from vstart +endpoint = 'http://127.0.0.1:8000' +access_key='0555b35654ad1656d804' +secret_key='h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q==' + +client = boto3.client('s3', + endpoint_url=endpoint, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key) + +# reading bucket stats via HeadBucket + +response = client.head_bucket(Bucket=bucketname, ReadStats=True) + +print('Objects:', response['ObjectCount'], 'Bytes:', response['BytesUsed']) diff --git a/examples/rgw/boto3/service-2.sdk-extras.json b/examples/rgw/boto3/service-2.sdk-extras.json index b81667ecd09..4618543d61b 100644 --- a/examples/rgw/boto3/service-2.sdk-extras.json +++ b/examples/rgw/boto3/service-2.sdk-extras.json @@ -379,7 +379,36 @@ }, "documentation":"<p>A filter for all log object. 
Filter for the object by its key (prefix, suffix and regex).</p>", "locationName":"Filter" - } + }, + "HeadBucketRequest": { + "members": { + "ReadStats":{ + "shape":"ReadStats", + "documentation":"<p>Read additional usage statistics for <code>ObjectCount</code> and <code>BytesUsed</code> in the response.</p> <note> <p>This request parameter is a Ceph RGW extension.</p> </note>", + "location":"querystring", + "locationName":"read-stats" + } + } + }, + "HeadBucketOutput":{ + "members":{ + "ObjectCount":{ + "shape":"ObjectCount", + "documentation": "<p>Total number of objects/versions in the bucket.</p>", + "location": "header", + "locationName": "x-rgw-object-count" + }, + "BytesUsed":{ + "shape":"BytesUsed", + "documentation": "<p>Total size in bytes of all objects/versions in the bucket.</p>", + "location": "header", + "locationName": "x-rgw-bytes-used" + } + } + }, + "ReadStats":{"type":"boolean"}, + "ObjectCount":{"type":"integer"}, + "BytesUsed":{"type":"integer"} }, "documentation":"<p/>" } diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet index a15b88422fc..e917b4c2dac 100644 --- a/monitoring/ceph-mixin/config.libsonnet +++ b/monitoring/ceph-mixin/config.libsonnet @@ -9,12 +9,12 @@ CephNodeNetworkPacketDropsPerSec: 10, CephRBDMirrorImageTransferBandwidthThreshold: 0.8, CephRBDMirrorImagesPerDaemonThreshold: 100, - NVMeoFMaxGatewaysPerGroup: 4, - NVMeoFMaxGatewaysPerCluster: 4, + NVMeoFMaxGatewaysPerGroup: 8, + NVMeoFMaxGatewaysPerCluster: 32, NVMeoFHighGatewayCPU: 80, NVMeoFMaxSubsystemsPerGateway: 128, - NVMeoFMaxNamespaces: 1024, - NVMeoFHighClientCount: 32, + NVMeoFMaxNamespaces: 2048, + NVMeoFHighClientCount: 128, NVMeoFHighHostCPU: 80, // // Read/Write latency is defined in ms diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 3440d761351..7c0da4d51a4 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ 
b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -776,18 +776,18 @@ groups: type: "ceph_default" - alert: "NVMeoFTooManyGateways" annotations: - description: "You may create many gateways, but 4 is the tested limit" + description: "You may create many gateways, but 32 is the tested limit" summary: "Max supported gateways exceeded on cluster {{ $labels.cluster }}" - expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 4.00" + expr: "count(ceph_nvmeof_gateway_info) by (cluster) > 32.00" for: "1m" labels: severity: "warning" type: "ceph_default" - alert: "NVMeoFMaxGatewayGroupSize" annotations: - description: "You may create many gateways in a gateway group, but 4 is the tested limit" + description: "You may create many gateways in a gateway group, but 8 is the tested limit" summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }}" - expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 4.00" + expr: "count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00" for: "1m" labels: severity: "warning" @@ -832,7 +832,7 @@ groups: annotations: description: "Although you may continue to create namespaces in {{ $labels.gateway_host }}, the configuration may not be supported" summary: "The number of namespaces defined to the gateway exceeds supported values on cluster {{ $labels.cluster }}" - expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 1024.00" + expr: "sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,\"gateway_host\",\"$1\",\"instance\",\"(.*?)(?::.*)?\")) > 2048.00" for: "1m" labels: severity: "warning" @@ -848,9 +848,9 @@ groups: type: "ceph_default" - alert: "NVMeoFHighClientCount" annotations: - description: "The supported limit for clients connecting to a subsystem is 32" + description: "The supported limit for clients connecting to a subsystem is 128" summary: "The number of 
clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }}" - expr: "ceph_nvmeof_subsystem_host_count > 32.00" + expr: "ceph_nvmeof_subsystem_host_count > 128.00" for: "1m" labels: severity: "warning" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index b3b29308d08..83b4ff80375 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -2331,12 +2331,69 @@ tests: values: '1+0x20' - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.5",cluster="mycluster"}' values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.6",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.7",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.8",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.9",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.10",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.11",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.12",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.13",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.14",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.15",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.16",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.17",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.18",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.19",cluster="mycluster"}' + values: '1+0x20' + - series: 
'ceph_nvmeof_gateway_info{addr="1.1.1.20",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.21",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.22",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.23",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.24",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.25",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.26",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.27",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.28",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.29",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.30",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.31",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.32",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.33",cluster="mycluster"}' + values: '1+0x20' + promql_expr_test: - - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 4.00 + - expr: count(ceph_nvmeof_gateway_info) by (cluster) > 32.00 eval_time: 1m exp_samples: - labels: '{cluster="mycluster"}' - value: 5 + value: 33 alert_rule_test: - eval_time: 5m alertname: NVMeoFTooManyGateways @@ -2347,7 +2404,7 @@ tests: type: ceph_default exp_annotations: summary: "Max supported gateways exceeded on cluster mycluster" - description: "You may create many gateways, but 4 is the tested limit" + description: "You may create many gateways, but 32 is the tested limit" # NVMeoFMaxGatewayGroupSize - interval: 1m @@ -2362,16 +2419,24 @@ tests: values: '1+0x20' - series: 
'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.12",cluster="mycluster"}' values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.10",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.14",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.11",cluster="mycluster"}' + values: '1+0x20' + - series: 'ceph_nvmeof_gateway_info{group="group-1",addr="1.1.1.13",cluster="mycluster"}' + values: '1+0x20' - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.4",cluster="mycluster"}' values: '1+0x20' - series: 'ceph_nvmeof_gateway_info{group="group-2",addr="1.1.1.5",cluster="mycluster"}' values: '1+0x20' promql_expr_test: - - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 4.00 + - expr: count(ceph_nvmeof_gateway_info) by (cluster, group) > 8.00 eval_time: 1m exp_samples: - labels: '{cluster="mycluster",group="group-1"}' - value: 5 + value: 9 alert_rule_test: - eval_time: 5m alertname: NVMeoFMaxGatewayGroupSize @@ -2383,7 +2448,7 @@ tests: type: ceph_default exp_annotations: summary: "Max gateways within a gateway group (group-1) exceeded on cluster mycluster" - description: "You may create many gateways in a gateway group, but 4 is the tested limit" + description: "You may create many gateways in a gateway group, but 8 is the tested limit" # NVMeoFSingleGatewayGroup - interval: 1m @@ -2767,12 +2832,14 @@ tests: values: '200+0x10' - series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn10",cluster="mycluster"}' values: '200+0x10' + - series: 'ceph_nvmeof_subsystem_namespace_count{instance="node-1:10008",nqn="nqn11",cluster="mycluster"}' + values: '200+0x10' promql_expr_test: - - expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 1024 + - expr: sum by(gateway_host, cluster) 
(label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*):.*")) > 2048 eval_time: 1m exp_samples: - labels: '{gateway_host="node-1", cluster="mycluster"}' - value: 2000 + value: 2200 alert_rule_test: - eval_time: 5m alertname: NVMeoFTooManyNamespaces @@ -2815,15 +2882,15 @@ tests: - interval: 1m input_series: - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn1",cluster="mycluster"}' - values: '2 2 2 4 4 8 8 8 10 10 20 20 32 34 34 38 38 40 44 44' + values: '2 4 8 10 20 30 40 50 62 74 80 95 100 110 130 130 130 130 130 130' - series: 'ceph_nvmeof_subsystem_host_count{nqn="nqn2",cluster="mycluster"}' - values: '2 2 2 8 8 8 16 16 16 16 16 16 16 16 16 16 16 16 16 16' + values: '2 8 16 16 16 16 16 16 16 16 20 20 32 34 34 36 37 37 37 37' promql_expr_test: - - expr: ceph_nvmeof_subsystem_host_count > 32.00 + - expr: ceph_nvmeof_subsystem_host_count > 128.00 eval_time: 15m exp_samples: - labels: '{__name__="ceph_nvmeof_subsystem_host_count",nqn="nqn1",cluster="mycluster"}' - value: 38 + value: 130 alert_rule_test: - eval_time: 20m alertname: NVMeoFHighClientCount @@ -2835,7 +2902,7 @@ tests: type: ceph_default exp_annotations: summary: "The number of clients connected to nqn1 is too high on cluster mycluster" - description: "The supported limit for clients connecting to a subsystem is 32" + description: "The supported limit for clients connecting to a subsystem is 128" # NVMeoFMissingListener - interval: 1m diff --git a/qa/config/crimson_bluestore.yaml b/qa/config/crimson_bluestore.yaml new file mode 100644 index 00000000000..d5ba487b9bf --- /dev/null +++ b/qa/config/crimson_bluestore.yaml @@ -0,0 +1,25 @@ +overrides: + ceph: + fs: xfs + conf: + osd: + # crimson's osd objectstore option + crimson osd objectstore: bluestore + debug alienstore: 20 + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore compression mode: aggressive + bluestore fsck on mount: true + bluestore compression 
algorithm: snappy + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bluestore rocksdb cf: false + log to stderr: true + err to stderr: true + log flush on exit: true + log to file: false diff --git a/qa/config/crimson_qa_overrides.yaml b/qa/config/crimson_qa_overrides.yaml index 8cf98f38001..a10c59d77cc 100644 --- a/qa/config/crimson_qa_overrides.yaml +++ b/qa/config/crimson_qa_overrides.yaml @@ -9,7 +9,6 @@ overrides: osd pool default crimson: true osd: crimson osd obc lru size: 10 - debug alienstore: 20 debug ms: 20 flavor: crimson workunit: diff --git a/qa/config/seastore.yaml b/qa/config/crimson_seastore.yaml index 6158563eedf..d1919456ab1 100644 --- a/qa/config/seastore.yaml +++ b/qa/config/crimson_seastore.yaml @@ -1,13 +1,13 @@ overrides: ceph: - fs: xfs conf: osd: - osd objectstore: seastore + # crimson's osd objectstore option + crimson osd objectstore: seastore debug seastore: 20 debug seastore onode: 20 debug seastore odata: 20 - debug seastore ompap: 20 + debug seastore omap: 20 debug seastore tm: 20 debug seastore t: 20 debug seastore cleaner: 20 diff --git a/qa/crontab/teuthology-cronjobs b/qa/crontab/teuthology-cronjobs index c979e5b105f..c558a1382ef 100644 --- a/qa/crontab/teuthology-cronjobs +++ b/qa/crontab/teuthology-cronjobs @@ -52,7 +52,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 00 05 * * 0,2,4 $CW $SS 1 --ceph main --suite smoke -p 100 --force-priority 08 05 * * 0 $CW $SS 1 --ceph squid --suite smoke -p 100 --force-priority 16 05 * * 0 $CW $SS 1 --ceph reef --suite smoke -p 100 --force-priority -24 05 * * 0 $CW $SS 1 --ceph quincy --suite smoke -p 100 --force-priority ## ********** windows tests on main branch - weekly # 00 03 * * 1 CEPH_BRANCH=main; MACHINE_NAME=smithi; $CW teuthology-suite -v -c $CEPH_BRANCH -n 100 -m $MACHINE_NAME -s windows -k distro -e 
$CEPH_QA_EMAIL @@ -122,7 +121,6 @@ TEUTHOLOGY_SUITE_ARGS="--non-interactive --newest=100 --ceph-repo=https://git.ce 16 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade-clients/client-upgrade-pacific-quincy --suite-branch pacific -p 820 24 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:octopus-x -p 820 32 00 * * 1 $CW $SS 120000 --ceph quincy --suite upgrade:pacific-x -p 820 -40 00 * * 1 $CW $SS 1 --ceph quincy --suite upgrade/quincy-p2p -p 820 ### upgrade runs for reef release ###### on smithi diff --git a/qa/standalone/osd/osd-bluefs-volume-ops.sh b/qa/standalone/osd/osd-bluefs-volume-ops.sh index aedfbc9b5cb..f7424de8ce1 100755 --- a/qa/standalone/osd/osd-bluefs-volume-ops.sh +++ b/qa/standalone/osd/osd-bluefs-volume-ops.sh @@ -72,7 +72,7 @@ function TEST_bluestore() { truncate $dir/0/block -s 4294967296 # 4GB ceph-bluestore-tool --path $dir/0 bluefs-bdev-expand || return 1 - truncate $dir/1/block -s 4311744512 # 4GB + 16MB + truncate $dir/1/block -s 11811160064 # 11GB to get bdev label at 10737418240 ceph-bluestore-tool --path $dir/1 bluefs-bdev-expand || return 1 truncate $dir/2/block -s 4295099392 # 4GB + 129KB ceph-bluestore-tool --path $dir/2 bluefs-bdev-expand || return 1 diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh index 843e9b9901b..7b77a60f35b 100755 --- a/qa/standalone/scrub/osd-recovery-scrub.sh +++ b/qa/standalone/scrub/osd-recovery-scrub.sh @@ -163,7 +163,7 @@ function wait_for_scrub_mod() { fi sleep 1 # are we still the primary? 
- local current_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local current_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` if [ $orig_primary != $current_primary ]; then echo $orig_primary no longer primary for $pgid return 0 @@ -194,7 +194,7 @@ function pg_scrub_mod() { local last_scrub=$(get_last_scrub_stamp $pgid) # locate the primary - local my_primary=`bin/ceph pg $pgid query | jq '.acting[0]' ` + local my_primary=`./bin/ceph pg $pgid query | jq '.acting[0]' ` local recovery=false ceph pg scrub $pgid #ceph --format json pg dump pgs | jq ".pg_stats | .[] | select(.pgid == \"$pgid\") | .state" diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh index 8015e023bdd..385479258f2 100755 --- a/qa/standalone/scrub/osd-scrub-test.sh +++ b/qa/standalone/scrub/osd-scrub-test.sh @@ -544,6 +544,9 @@ function TEST_dump_scrub_schedule() { --osd_op_queue=wpq \ --osd_stats_update_period_not_scrubbing=1 \ --osd_stats_update_period_scrubbing=1 \ + --osd_scrub_retry_after_noscrub=1 \ + --osd_scrub_retry_pg_state=2 \ + --osd_scrub_retry_delay=2 \ --osd_scrub_sleep=0.2" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -600,17 +603,16 @@ function TEST_dump_scrub_schedule() { declare -A expct_dmp_duration=( ['dmp_last_duration']="0" ['dmp_last_duration_neg']="not0" ) wait_any_cond $pgid 10 $saved_last_stamp expct_dmp_duration "WaitingAfterScrub_dmp " sched_data || return 1 - sleep 2 - # # step 2: set noscrub and request a "periodic scrub". 
Watch for the change in the 'is the scrub # scheduled for the future' value # - ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 - ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 ceph osd set noscrub || return 1 sleep 2 + ceph tell osd.* config set osd_shallow_scrub_chunk_max "3" || return 1 + ceph tell osd.* config set osd_scrub_sleep "2.0" || return 1 + sleep 8 saved_last_stamp=${sched_data['query_last_stamp']} ceph tell $pgid schedule-scrub @@ -683,6 +685,234 @@ function TEST_pg_dump_objects_scrubbed() { teardown $dir || return 1 } +function wait_initial_scrubs() { + local -n pg_to_prim_dict=$1 + local extr_dbg=1 # note: 3 and above leave some temp files around + + # set a long schedule for the periodic scrubs. Wait for the + # initial 'no previous scrub is known' scrubs to finish for all PGs. + ceph tell osd.* config set osd_scrub_min_interval 7200 + ceph tell osd.* config set osd_deep_scrub_interval 14400 + ceph tell osd.* config set osd_max_scrubs 32 + ceph tell osd.* config set osd_scrub_sleep 0 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 10 + ceph tell osd.* config set osd_scrub_chunk_max 10 + + for pg in "${!pg_to_prim_dict[@]}"; do + (( extr_dbg >= 1 )) && echo "Scheduling initial scrub for $pg" + ceph tell $pg scrub || return 1 + done + + sleep 1 + (( extr_dbg >= 1 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + + tout=20 + while [ $tout -gt 0 ] ; do + sleep 0.5 + (( extr_dbg >= 2 )) && ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' + not_done=$(ceph pg dump pgs --format=json-pretty | \ + jq '.pg_stats | map(select(.last_scrub_duration == 0)) | map({pgid: .pgid, last_scrub_duration: .last_scrub_duration})' | wc -l ) + # note that we should ignore a header line + if [ 
"$not_done" -le 1 ]; then + break + fi + not_done=$(( (not_done - 2) / 4 )) + echo "Still waiting for $not_done PGs to finish initial scrubs (timeout $tout)" + tout=$((tout - 1)) + done + (( tout == 0 )) && return 1 + return 0 +} + + +# Whenever a PG is being scrubbed at a regular, periodic, urgency, and is queued +# for its replicas: +# if the operator is requesting a scrub of the same PG, the operator's request +# should trigger an abort of the ongoing scrub. +# +# The test process: +# - a periodic scrub is initiated of a PG. That scrub is set to be a very slow one. +# - a second PG, which shares some of its replicas, is instructed to be scrubbed. That one +# should be stuck in replica reservation. We will verify that. +# - now - the operator is requesting that second PG to be scrubbed. The original (pending) +# scrub should be aborted. We would check for: +# - the new, operator's scrub to be scheduled +# - the replicas' reservers to be released +function TEST_abort_periodic_for_operator() { + local dir=$1 + local -A cluster_conf=( + ['osds_num']="5" + ['pgs_in_pool']="16" + ['pool_name']="test" + ) + local extr_dbg=1 # note: 3 and above leave some temp files around + + standard_scrub_wpq_cluster "$dir" cluster_conf 3 || return 1 + local poolid=${cluster_conf['pool_id']} + local poolname=${cluster_conf['pool_name']} + echo "Pool: $poolname : $poolid" + + #turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # fill the pool with some data + TESTDATA="testdata.$$" + dd if=/dev/urandom of=$TESTDATA bs=320 count=1 + for i in $( seq 1 256 ) + do + rados -p "$poolname" put "obj${i}" $TESTDATA 2>/dev/null 1>/dev/null + done + rm -f $TESTDATA + if [[ -n "$saved_echo_flag" ]]; then set -x; fi + + # create the dictionary of the PGs in the pool + declare -A pg_pr + declare -A pg_ac + declare -A pg_po + build_pg_dicts "$dir" pg_pr pg_ac pg_po "-" + (( extr_dbg >= 2 )) && echo "PGs table:" + for pg in "${!pg_pr[@]}"; do + (( extr_dbg >= 
2 )) && echo "Got: $pg: ${pg_pr[$pg]} ( ${pg_ac[$pg]} ) ${pg_po[$pg]}" + done + + wait_initial_scrubs pg_pr || return 1 + + # limit all OSDs to one scrub at a time + ceph tell osd.* config set osd_max_scrubs 1 + ceph tell osd.* config set osd_stats_update_period_not_scrubbing 1 + + # configure for slow scrubs + ceph tell osd.* config set osd_scrub_sleep 3 + ceph tell osd.* config set osd_shallow_scrub_chunk_max 2 + ceph tell osd.* config set osd_scrub_chunk_max 2 + (( extr_dbg >= 2 )) && ceph tell osd.2 dump_scrub_reservations --format=json-pretty + + # the first PG to work with: + local pg1="1.0" + # and another one, that shares its primary, and at least one more active set member + local pg2="" + for pg in "${!pg_pr[@]}"; do + if [[ "${pg_pr[$pg]}" == "${pg_pr[$pg1]}" ]]; then + local -i common=0 + count_common_active $pg $pg1 pg_ac common + if [[ $common -gt 1 ]]; then + pg2=$pg + break + fi + fi + done + if [[ -z "$pg2" ]]; then + # \todo handle the case when no such PG is found + echo "No PG found with the same primary as $pg1" + return 1 + fi + + # the common primary is allowed two concurrent scrubs + ceph tell osd."${pg_pr[$pg1]}" config set osd_max_scrubs 2 + echo "The two PGs to manipulate are $pg1 and $pg2" + + set_query_debug "$pg1" + # wait till the information published by pg1 is updated to show it as + # not being scrubbed + local is_act + for i in $( seq 1 3 ) + do + is_act=$(ceph pg "$pg1" query | jq '.scrubber.active') + if [[ "$is_act" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to finish scrubbing" + sleep 0.7 + done + ceph pg dump pgs + if [[ "$is_act" != "false" ]]; then + ceph pg "$pg1" query + echo "PG $pg1 appears to be still scrubbing" + return 1 + fi + sleep 0.5 + + echo "Initiating a periodic scrub of $pg1" + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + ceph tell $pg1 schedule-deep-scrub || return 1 + sleep 1 + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq 
'.scrubber' + + for i in $( seq 1 14 ) + do + sleep 0.5 + stt=$(ceph pg "$pg1" query | jq '.scrubber') + is_active=$(echo $stt | jq '.active') + is_reserving_replicas=$(echo $stt | jq '.is_reserving_replicas') + if [[ "$is_active" = "true" && "$is_reserving_replicas" = "false" ]]; then + break + fi + echo "Still waiting for pg $pg1 to start scrubbing: $stt" + done + if [[ "$is_active" != "true" || "$is_reserving_replicas" != "false" ]]; then + ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + echo "The scrub is not active or is reserving replicas" + return 1 + fi + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + + # PG 1 is scrubbing, and has reserved the replicas - some of which are shared + # by PG 2. As the max-scrubs was set to 1, that should prevent PG 2 from + # reserving its replicas. + + (( extr_dbg >= 1 )) && ceph tell osd.* dump_scrub_reservations --format=json-pretty + + # now - the 2'nd scrub - which should be blocked on reserving + set_query_debug "$pg2" + ceph tell "$pg2" schedule-deep-scrub + sleep 0.5 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + sleep 1 + (( extr_dbg >= 2 )) && echo "====================================================================================" + (( extr_dbg >= 2 )) && ceph pg "$pg2" query -f json-pretty | jq '.scrubber' + (( extr_dbg >= 2 )) && ceph pg "$pg1" query -f json-pretty | jq '.scrubber' + + # make sure pg2 scrub is stuck in the reserving state + local stt2=$(ceph pg "$pg2" query | jq '.scrubber') + local pg2_is_reserving + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_reserving" != "true" ]]; then + echo "The scheduled scrub for $pg2 should have been stuck" + ceph pg dump pgs + return 1 + fi + + # now - issue an 
operator-initiated scrub on pg2. + # The periodic scrub should be aborted, and the operator-initiated scrub should start. + echo "Instructing $pg2 to perform a high-priority scrub" + ceph tell "$pg2" scrub + for i in $( seq 1 10 ) + do + sleep 0.5 + stt2=$(ceph pg "$pg2" query | jq '.scrubber') + pg2_is_active=$(echo $stt2 | jq '.active') + pg2_is_reserving=$(echo $stt2 | jq '.is_reserving_replicas') + if [[ "$pg2_is_active" = "true" && "$pg2_is_reserving" != "true" ]]; then + break + fi + echo "Still waiting: $stt2" + done + + if [[ "$pg2_is_active" != "true" || "$pg2_is_reserving" = "true" ]]; then + echo "The high-priority scrub for $pg2 is not active or is reserving replicas" + return 1 + fi + echo "Done" +} + + + main osd-scrub-test "$@" # Local Variables: diff --git a/qa/standalone/scrub/scrub-helpers.sh b/qa/standalone/scrub/scrub-helpers.sh index 49b8346b8d2..dd37b643e08 100644 --- a/qa/standalone/scrub/scrub-helpers.sh +++ b/qa/standalone/scrub/scrub-helpers.sh @@ -240,8 +240,8 @@ function standard_scrub_cluster() { local saved_echo_flag=${-//[^x]/} set +x - run_mon $dir a --osd_pool_default_size=$OSDS || return 1 - run_mgr $dir x || return 1 + run_mon $dir a --osd_pool_default_size=3 || return 1 + run_mgr $dir x --mgr_stats_period=1 || return 1 local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \ --osd_scrub_interval_randomize_ratio=0 \ @@ -249,9 +249,12 @@ function standard_scrub_cluster() { --osd_pool_default_pg_autoscale_mode=off \ --osd_pg_stat_report_interval_max_seconds=1 \ --osd_pg_stat_report_interval_max_epochs=1 \ + --osd_stats_update_period_not_scrubbing=3 \ + --osd_stats_update_period_scrubbing=1 \ --osd_scrub_retry_after_noscrub=5 \ --osd_scrub_retry_pg_state=5 \ --osd_scrub_retry_delay=3 \ + --osd_pool_default_size=3 \ $extra_pars" for osd in $(seq 0 $(expr $OSDS - 1)) @@ -297,6 +300,107 @@ function standard_scrub_wpq_cluster() { } +# Parse the output of a 'pg dump pgs_brief' command and build a set of dictionaries: +# - 
pg_primary_dict: a dictionary of pgid -> acting_primary +# - pg_acting_dict: a dictionary of pgid -> acting set +# - pg_pool_dict: a dictionary of pgid -> pool +# If the input file is '-', the function will fetch the dump directly from the ceph cluster. +function build_pg_dicts { + local dir=$1 + local -n pg_primary_dict=$2 + local -n pg_acting_dict=$3 + local -n pg_pool_dict=$4 + local infile=$5 + + local extr_dbg=0 # note: 3 and above leave some temp files around + + #turn off '-x' (but remember previous state) + local saved_echo_flag=${-//[^x]/} + set +x + + # if the infile name is '-', fetch the dump directly from the ceph cluster + if [[ $infile == "-" ]]; then + local -r ceph_cmd="ceph pg dump pgs_brief -f=json-pretty" + local -r ceph_cmd_out=$(eval $ceph_cmd) + local -r ceph_cmd_rc=$? + if [[ $ceph_cmd_rc -ne 0 ]]; then + echo "Error: the command '$ceph_cmd' failed with return code $ceph_cmd_rc" + fi + (( extr_dbg >= 3 )) && echo "$ceph_cmd_out" > /tmp/e2 + l0=`echo "$ceph_cmd_out" | jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' ` + else + l0=`jq '[.pg_stats | group_by(.pg_stats)[0] | map({pgid: .pgid, pool: (.pgid | split(".")[0]), acting: .acting, acting_primary: .acting_primary})] | .[]' $infile ` + fi + (( extr_dbg >= 2 )) && echo "L0: $l0" + + mapfile -t l1 < <(echo "$l0" | jq -c '.[]') + (( extr_dbg >= 2 )) && echo "L1: ${#l1[@]}" + + for item in "${l1[@]}"; do + pgid=$(echo "$item" | jq -r '.pgid') + acting=$(echo "$item" | jq -r '.acting | @sh') + pg_acting_dict["$pgid"]=$acting + acting_primary=$(echo "$item" | jq -r '.acting_primary') + pg_primary_dict["$pgid"]=$acting_primary + pool=$(echo "$item" | jq -r '.pool') + pg_pool_dict["$pgid"]=$pool + done + + if [[ -n "$saved_echo_flag" ]]; then set -x; fi +} + + +# a function that counts the number of common active-set elements between two PGs +# 1 - the first PG +# 2 - the second PG +# 3 - the 
dictionary of active sets; 4 - [out] the number of common elements +function count_common_active { + local pg1=$1 + local pg2=$2 + local -n pg_acting_dict=$3 + local -n res=$4 + + local -a a1=(${pg_acting_dict[$pg1]}) + local -a a2=(${pg_acting_dict[$pg2]}) + + local -i cnt=0 + for i in "${a1[@]}"; do + for j in "${a2[@]}"; do + if [[ $i -eq $j ]]; then + cnt=$((cnt+1)) + fi + done + done + + res=$cnt +} + + +# given a PG, find another one with a disjoint active set +# - but allow a possible common Primary +# 1 - the PG +# 2 - the dictionary of active sets; 3 - the dictionary of primaries +# 4 - [out] - the PG with a disjoint active set +function find_disjoint_but_primary { + local pg=$1 + local -n ac_dict=$2 + local -n p_dict=$3 + local -n res=$4 + + for cand in "${!ac_dict[@]}"; do + if [[ "$cand" != "$pg" ]]; then + local -i common=0 + count_common_active "$pg" "$cand" ac_dict common + if [[ $common -eq 0 || ( $common -eq 1 && "${p_dict[$pg]}" == "${p_dict[$cand]}" )]]; then + res=$cand + return + fi + fi + done +} + + + # A debug flag is set for the PG specified, causing the 'pg query' command to display # an additional 'scrub sessions counter' field. # diff --git a/qa/suites/crimson-rados-experimental/.qa b/qa/suites/crimson-rados-experimental/.qa index fea2489fdf6..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/.qa +++ b/qa/suites/crimson-rados-experimental/.qa @@ -1 +1 @@ -../.qa
\ No newline at end of file +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml deleted file mode 120000 index bd9854e7029..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/centos_latest.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/distros/supported/centos_latest.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml deleted file mode 100644 index d8e5898b99f..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-1.yaml +++ /dev/null @@ -1,14 +0,0 @@ -overrides: - ceph-deploy: - conf: - global: - osd pool default size: 2 - osd crush chooseleaf type: 0 - osd pool default pg num: 128 - osd pool default pgp num: 128 - ceph: - conf: - osd: - osd shutdown pgref assert: true -roles: -- [mon.a, mgr.x, osd.0, osd.1, osd.2, client.0] diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml deleted file mode 100644 index c22f08eecf8..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/ceph.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - install: - ceph: - flavor: crimson -tasks: -- install: -- ceph: - conf: - osd: - debug monc: 20 - mon: - mon min osdmap epochs: 50 - paxos service trim min: 10 - # prune full osdmaps regularly - mon osdmap full prune min: 15 - mon osdmap full prune interval: 2 - mon osdmap full prune txsize: 2 - flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml deleted file mode 120000 index 6a70c381709..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/seastore.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/config/seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml deleted file mode 100644 index ad8c921425b..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/rados_api_tests.yaml +++ /dev/null @@ -1,28 +0,0 @@ -overrides: - ceph: - log-ignorelist: - - reached quota - - but it is still running - - overall HEALTH_ - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(CACHE_POOL_NO_HIT_SET\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(PG_DEGRADED\) - conf: - client: - debug ms: 1 - mon: - mon warn on pool no app: false - osd: - osd class load list: "*" - osd class default list: "*" - osd blocked scrub grace period: 3600 -tasks: -- workunit: - clients: - client.0: - - rados/test.sh - - rados/test_pool_quota.sh diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml b/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml deleted file mode 100644 index 25efcdac83d..00000000000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/readwrite.yaml +++ /dev/null @@ -1,18 +0,0 @@ -overrides: - ceph: - crush_tunables: optimal - conf: - mon: - mon osd initial require min compat client: luminous - osd: - osd_discard_disconnected_ops: false -tasks: -- rados: - clients: [client.0] - ops: 4000 - objects: 500 - max_attr_len: 8192 - op_weights: - read: 45 - write: 45 - delete: 10 diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/% b/qa/suites/crimson-rados-experimental/thrash/% index e69de29bb2d..e69de29bb2d 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/% +++ b/qa/suites/crimson-rados-experimental/thrash/% diff --git a/qa/suites/crimson-rados-experimental/seastore/.qa b/qa/suites/crimson-rados-experimental/thrash/.qa index a602a0353e7..a602a0353e7 120000 --- 
a/qa/suites/crimson-rados-experimental/seastore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/.qa b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled new file mode 120000 index 00000000000..5393a75548a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/2-size-2-min-size.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/2-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml new file mode 120000 index 00000000000..5ff70eadf75 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/0-size-min-size-overrides/3-size-2-min-size.yaml @@ -0,0 +1 @@ +.qa/overrides/3-size-2-min-size.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/normal_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados-experimental/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/$ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/deploy/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ 
b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/default.yaml diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled new file mode 120000 index 00000000000..47afd70202d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-active-recovery.yaml.disabled @@ -0,0 +1 @@ +.qa/overrides/more-active-recovery.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..0bbc72db754 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-partial-recovery.yaml.disabled @@ -0,0 +1,6 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled new file mode 100644 index 00000000000..4aed086bcc3 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-async-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_async_recovery_min_cost: 1 diff --git a/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled new file mode 100644 index 00000000000..88f15f2f691 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/2-recovery-overrides/more-partial-recovery.yaml.disabled @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + global: + osd_object_clean_region_max_num_intervals: 1000 diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/+ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/+ diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa index a602a0353e7..a602a0353e7 120000 --- 
a/qa/suites/crimson-rados-experimental/seastore/basic/objectstore/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/.qa diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml index 9774de6887b..79641f695ab 100644 --- a/qa/suites/crimson-rados-experimental/seastore/basic/clusters/fixed-2.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/fixed-2.yaml @@ -6,6 +6,15 @@ overrides: conf: osd: osd shutdown pgref assert: true + crimson alien thread cpu cores: 6-7 + osd.0: + crimson seastar cpu cores: 0-2 + osd.1: + crimson seastar cpu cores: 3-5 + osd.2: + crimson seastar cpu cores: 0-2 + osd.3: + crimson seastar cpu cores: 3-5 global: ms cluster mode: crc ms service mode: crc diff --git a/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled new file mode 100644 index 00000000000..e559d9126e8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/clusters/openstack.yaml.disabled @@ -0,0 +1,4 @@ +openstack: + - volumes: # attached to each instance + count: 4 + size: 10 # GB diff --git a/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro new file mode 120000 index 00000000000..a5b729b9efa --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/crimson-supported-all-distro @@ -0,0 +1 @@ +.qa/distros/crimson-supported-all-distro/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml index 2bf67af1b18..2bf67af1b18 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/crimson_qa_overrides.yaml +++ b/qa/suites/crimson-rados-experimental/thrash/crimson_qa_overrides.yaml diff --git a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa index a602a0353e7..a602a0353e7 120000 --- a/qa/suites/crimson-rados-experimental/seastore/basic/tasks/.qa +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/.qa diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml new file mode 100644 index 00000000000..ecad09cfe3a --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/ceph.yaml @@ -0,0 +1,11 @@ +overrides: + install: + ceph: + flavor: crimson +tasks: +- install: +- ceph: + conf: + osd: + debug monc: 20 + flavor: crimson diff --git a/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled new file mode 100644 index 00000000000..0c2062240ee --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/deploy/cephadm.yaml.disabled @@ -0,0 +1,16 @@ +# no need to verify os + flavor + sha1 +verify_ceph_hash: false +tasks: +- cephadm: + conf: + mgr: + debug ms: 1 + debug mgr: 20 + debug osd: 10 +- cephadm.shell: + mon.a: + - ceph orch status + - ceph orch ps + - ceph orch ls + - ceph orch host ls + - ceph orch device ls diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml new file mode 100644 index 00000000000..aa44b6101ff --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashers/default.yaml @@ -0,0 +1,34 @@ +overrides: + ceph: + log-ignorelist: + - but it is still running + - objects unfound and apparently lost + conf: + osd: + osd debug reject backfill probability: .3 + osd scrub min interval: 60 + osd scrub max interval: 120 + osd max backfills: 3 + osd snap trim sleep: 2 + osd delete sleep: 1 + mon: + mon min osdmap epochs: 50 + paxos service trim min: 10 + # prune full osdmaps regularly + mon osdmap full prune min: 15 + mon osdmap full prune interval: 2 + mon osdmap full prune txsize: 2 +tasks: +- thrashosds: + timeout: 2400 + dump_ops_enable: false + sighup_delay: 0 + min_in: 3 + noscrub_toggle_delay: 0 + chance_thrash_pg_upmap: 0 + reweight_osd: 0 + thrash_primary_affinity: false + ceph_objectstore_tool: false + chance_inject_pause_short: 0 + chance_thrash_cluster_full: 0 + chance_reset_purged_snaps_last: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml new file mode 120000 index 00000000000..9124eb1aa29 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/thrashosds-health.yaml @@ -0,0 +1 @@ +.qa/tasks/thrashosds-health.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/.qa b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml new file mode 100644 index 00000000000..8c9764ade84 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/admin_socket_objecter_requests.yaml @@ -0,0 +1,13 @@ +overrides: + ceph: + conf: + client.0: + admin socket: /var/run/ceph/ceph-$name.asok +tasks: +- radosbench: + clients: [client.0] + time: 150 +- admin_socket: + client.0: + objecter_requests: + test: "http://git.ceph.com/?p={repo};a=blob_plain;f=src/test/admin_socket/objecter_requests;hb={branch}" diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml new file mode 100644 index 00000000000..d35e8421ab4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/pool-snaps-few-objects.yaml @@ -0,0 +1,20 @@ +overrides: + conf: + osd: + osd deep scrub update digest min age: 0 +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + pool_snaps: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml new file mode 100644 index 00000000000..902c4b56a1e --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench-high-concurrency.yaml @@ -0,0 +1,49 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + 
time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 + - radosbench: + clients: [client.0] + concurrency: 128 + size: 8192 + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml new file mode 100644 index 00000000000..071f55e3928 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/radosbench.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + conf: + client.0: + debug ms: 1 + debug objecter: 20 + debug rados: 20 +tasks: +- full_sequential: + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 + - radosbench: + clients: [client.0] + time: 90 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml new file mode 100644 index 00000000000..afe04229898 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-balanced.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + balance_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git 
a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml new file mode 100644 index 00000000000..445b582ea42 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects-localized.yaml @@ -0,0 +1,24 @@ +overrides: + ceph: + crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + localize_reads: true + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml new file mode 100644 index 00000000000..e7e8070fd76 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/small-objects.yaml @@ -0,0 +1,23 @@ +overrides: + ceph: + crush_tunables: jewel +tasks: +- rados: + clients: [client.0] + ops: 400000 + max_seconds: 600 + max_in_flight: 64 + objects: 1024 + size: 16384 + max_attr_len: 8192 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 + setattr: 25 + rmattr: 25 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml new file mode 100644 index 00000000000..1161c3cc253 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-balanced.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + balance_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM 
+ copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml new file mode 100644 index 00000000000..80af0def0e4 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects-localized.yaml @@ -0,0 +1,15 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + localize_reads: true + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml new file mode 100644 index 00000000000..0694ffcd0d6 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/snaps-few-objects.yaml @@ -0,0 +1,14 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 50 + op_weights: + read: 100 + write: 100 + delete: 50 + snap_create: 50 + snap_remove: 50 + rollback: 0 + # TODO: CEPH_OSD_OP_COPY_FROM + copy_from: 0 diff --git a/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml new file mode 100644 index 00000000000..606dcae6922 --- /dev/null +++ b/qa/suites/crimson-rados-experimental/thrash/workloads/write_fadvise_dontneed.yaml @@ -0,0 +1,8 @@ +tasks: +- rados: + clients: [client.0] + ops: 4000 + objects: 500 + write_fadvise_dontneed: true + op_weights: + write: 100 diff --git a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/basic/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/basic/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/perf/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/perf/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/rbd/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore b/qa/suites/crimson-rados/singleton/objectstore deleted file mode 120000 index dbccf5ad928..00000000000 --- a/qa/suites/crimson-rados/singleton/objectstore +++ /dev/null @@ -1 +0,0 @@ -../thrash/objectstore
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/.qa b/qa/suites/crimson-rados/singleton/objectstore/.qa new file mode 120000 index 00000000000..a602a0353e7 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/.qa @@ -0,0 +1 @@ +../.qa/
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml new file mode 120000 index 00000000000..481e393be4a --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/bluestore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/singleton/objectstore/seastore.yaml @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml index abd86d7d986..abd86d7d986 120000 --- a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml.disabled +++ b/qa/suites/crimson-rados/thrash/1-pg-log-overrides/short_pg_log.yaml diff --git a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled new file mode 120000 index 00000000000..61e26e7acf8 --- /dev/null +++ b/qa/suites/crimson-rados/thrash/objectstore/seastore.yaml.disabled @@ -0,0 +1 @@ +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml new file mode 120000 index 00000000000..abd86d7d986 --- /dev/null +++ b/qa/suites/crimson-rados/thrash_simple/1-pg-log-overrides/short_pg_log.yaml @@ -0,0 +1 @@ +.qa/overrides/short_pg_log.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml index e84f396e4b2..481e393be4a 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/bluestore.yaml @@ -1 +1 @@ -.qa/config/bluestore.yaml
\ No newline at end of file +.qa/config/crimson_bluestore.yaml
\ No newline at end of file diff --git a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml index 6a70c381709..61e26e7acf8 120000 --- a/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml +++ b/qa/suites/crimson-rados/thrash_simple/objectstore/seastore.yaml @@ -1 +1 @@ -.qa/config/seastore.yaml
\ No newline at end of file +.qa/config/crimson_seastore.yaml
\ No newline at end of file diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml index 55dde639c23..b7a0338566c 100644 --- a/qa/suites/fs/multifs/tasks/failover.yaml +++ b/qa/suites/fs/multifs/tasks/failover.yaml @@ -8,6 +8,7 @@ overrides: - \(MDS_DAMAGE\) - \(FS_DEGRADED\) - \(MDS_CACHE_OVERSIZED\) + - \(MDS_ESTIMATED_REPLAY_TIME\) ceph-fuse: disabled: true tasks: diff --git a/qa/suites/fs/nfs/tasks/nfs.yaml b/qa/suites/fs/nfs/tasks/nfs.yaml index aa966bff214..2dd668c9f88 100644 --- a/qa/suites/fs/nfs/tasks/nfs.yaml +++ b/qa/suites/fs/nfs/tasks/nfs.yaml @@ -1,3 +1,10 @@ +overrides: + install: + extra_system_packages: + rpm: + - fio + deb: + - fio tasks: - cephfs_test_runner: modules: diff --git a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml index 602d3416263..aa327b0cdf5 100644 --- a/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml +++ b/qa/suites/fs/workload/tasks/6-workunit/kernel_untar_build.yaml @@ -5,6 +5,7 @@ overrides: - "mds.dir_split" tasks: - workunit: + timeout: 5h clients: all: - kernel_untar_build.sh diff --git a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml index 372bf2561fa..8b3c4c11ac6 100644 --- a/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml +++ b/qa/suites/rados/monthrash/workloads/rados_mon_osdmap_prune.yaml @@ -15,6 +15,7 @@ overrides: # causing tests to fail due to health warns, even if # the tests themselves are successful. - \(OSDMAP_FLAGS\) + - \(PG_DEGRADED\) tasks: - workunit: clients: diff --git a/qa/suites/rados/verify/clusters/fixed-4.yaml b/qa/suites/rados/verify/clusters/fixed-4.yaml new file mode 120000 index 00000000000..aa88300715a --- /dev/null +++ b/qa/suites/rados/verify/clusters/fixed-4.yaml @@ -0,0 +1 @@ +.qa/clusters/fixed-4.yaml
\ No newline at end of file diff --git a/qa/suites/rados/verify/validater/valgrind.yaml b/qa/suites/rados/verify/validater/valgrind.yaml index e2dc29b5f7e..17cf141b0cd 100644 --- a/qa/suites/rados/verify/validater/valgrind.yaml +++ b/qa/suites/rados/verify/validater/valgrind.yaml @@ -27,6 +27,7 @@ overrides: - \(SLOW_OPS\) - slow request - OSD bench result + - OSD_DOWN valgrind: mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes] osd: [--tool=memcheck] diff --git a/qa/suites/rgw/lua/tasks/0-install.yaml b/qa/suites/rgw/lua/tasks/0-install.yaml index fa6e279145c..d85ebcc5998 100644 --- a/qa/suites/rgw/lua/tasks/0-install.yaml +++ b/qa/suites/rgw/lua/tasks/0-install.yaml @@ -3,7 +3,7 @@ tasks: - ceph: - openssl_keys: - rgw: [client.0] -- tox: [client.0] +- tox: [client.0] overrides: ceph: @@ -11,3 +11,11 @@ overrides: global: osd_min_pg_log_entries: 10 osd_max_pg_log_entries: 10 + install: + ceph: + extra_system_packages: + rpm: + - luarocks + deb: + - liblua5.3-dev + - luarocks diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/+ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/+ diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml new file mode 100644 index 00000000000..5c83d5c0d23 --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/0-install.yaml @@ -0,0 +1,20 @@ +tasks: +- install: +- ceph: +- openssl_keys: +- rgw: + client.0: + +overrides: + install: + ceph: + extra_system_packages: + rpm: + - java + deb: + - default-jre + ceph: + conf: + global: + osd_min_pg_log_entries: 10 + osd_max_pg_log_entries: 10 diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros new file mode 120000 index 00000000000..46280a42a96 --- 
/dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/supported-distros @@ -0,0 +1 @@ +../../.qa/distros/supported-random-distro$/
\ No newline at end of file diff --git a/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml new file mode 100644 index 00000000000..01d6fc637de --- /dev/null +++ b/qa/suites/rgw/notifications/tasks/kafka_failover/test_kafka.yaml @@ -0,0 +1,8 @@ +tasks: +- kafka-failover: + client.0: + kafka_version: 3.8.1 +- notification-tests: + client.0: + extra_attr: ["kafka_failover"] + rgw_server: client.0 diff --git a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml index 40fbcefe728..62fb6427f72 100644 --- a/qa/suites/upgrade/quincy-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/0-start.yaml @@ -32,13 +32,22 @@ overrides: osd: osd shutdown pgref assert: true log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml index e27c7c0f092..f7167975aa9 100644 --- a/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/quincy-x/parallel/1-tasks.yaml @@ -1,11 +1,8 @@ overrides: ceph: log-ignorelist: - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_AVAILABILITY + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml index 005514292ce..5641471629e 100644 --- a/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml +++ 
b/qa/suites/upgrade/quincy-x/stress-split/1-start.yaml @@ -1,17 +1,25 @@ overrides: ceph: log-ignorelist: - - \(POOL_APP_NOT_ENABLED\) + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum + - PG_AVAILABILITY - PG_DEGRADED - Reduced data availability - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED - OSDMAP_FLAGS - - PG_AVAILABILITY + - OSD_UPGRADE_FINISHED tasks: - install: branch: quincy diff --git a/qa/suites/upgrade/reef-x/parallel/0-start.yaml b/qa/suites/upgrade/reef-x/parallel/0-start.yaml index 146bd57960d..62fb6427f72 100644 --- a/qa/suites/upgrade/reef-x/parallel/0-start.yaml +++ b/qa/suites/upgrade/reef-x/parallel/0-start.yaml @@ -32,4 +32,22 @@ overrides: osd: osd shutdown pgref assert: true log-ignorelist: - - PG_DEGRADED + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN + - mons down + - mon down + - MON_DOWN + - out of quorum + - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED diff --git a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml index ce4e0cc228b..b5160c2dd00 100644 --- a/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml +++ b/qa/suites/upgrade/reef-x/parallel/1-tasks.yaml @@ -1,12 +1,8 @@ overrides: ceph: log-ignorelist: - - mons down - - mon down - - MON_DOWN - - out of quorum - - PG_AVAILABILITY - - PG_DEGRADED + - Telemetry requires re-opt-in + - telemetry module includes new collections tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml 
b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml index 5e995da7d2c..fa93b2f2ece 100644 --- a/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml +++ b/qa/suites/upgrade/reef-x/parallel/overrides/ignorelist_health.yaml @@ -1,20 +1,19 @@ overrides: ceph: log-ignorelist: - - \(MDS_ALL_DOWN\) - - \(MDS_UP_LESS_THAN_MAX\) - - \(OSD_SLOW_PING_TIME + - MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME - reached quota + - running out of quota - overall HEALTH_ - - \(CACHE_POOL_NO_HIT_SET\) - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(SLOW_OPS\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(OBJECT_MISPLACED\) + - CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED - slow request - - \(MON_DOWN\) - noscrub - nodeep-scrub diff --git a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml index 992f9e1bc36..59ccfe2cd02 100644 --- a/qa/suites/upgrade/reef-x/stress-split/1-start.yaml +++ b/qa/suites/upgrade/reef-x/stress-split/1-start.yaml @@ -1,11 +1,25 @@ overrides: ceph: log-ignorelist: + - do not have an application enabled + - application not enabled + - or freeform for custom applications + - POOL_APP_NOT_ENABLED + - is down + - OSD_DOWN - mons down - mon down - MON_DOWN - out of quorum - PG_AVAILABILITY + - PG_DEGRADED + - Reduced data availability + - Degraded data redundancy + - pg .* is stuck inactive + - pg .* is .*degraded + - FS_DEGRADED + - OSDMAP_FLAGS + - OSD_UPGRADE_FINISHED tasks: - install: branch: reef diff --git a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml index 5e995da7d2c..fa93b2f2ece 100644 --- a/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml +++ 
b/qa/suites/upgrade/reef-x/stress-split/overrides/ignorelist_health.yaml @@ -1,20 +1,19 @@ overrides: ceph: log-ignorelist: - - \(MDS_ALL_DOWN\) - - \(MDS_UP_LESS_THAN_MAX\) - - \(OSD_SLOW_PING_TIME + - MDS_ALL_DOWN + - MDS_UP_LESS_THAN_MAX + - OSD_SLOW_PING_TIME - reached quota + - running out of quota - overall HEALTH_ - - \(CACHE_POOL_NO_HIT_SET\) - - \(POOL_FULL\) - - \(SMALLER_PGP_NUM\) - - \(SLOW_OPS\) - - \(CACHE_POOL_NEAR_FULL\) - - \(POOL_APP_NOT_ENABLED\) - - \(PG_AVAILABILITY\) - - \(OBJECT_MISPLACED\) + - CACHE_POOL_NO_HIT_SET + - pool\(s\) full + - POOL_FULL + - SMALLER_PGP_NUM + - SLOW_OPS + - CACHE_POOL_NEAR_FULL + - OBJECT_MISPLACED - slow request - - \(MON_DOWN\) - noscrub - nodeep-scrub diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py index 9b04e3dc675..8f666d2fa9b 100644 --- a/qa/tasks/ceph.py +++ b/qa/tasks/ceph.py @@ -1206,8 +1206,18 @@ def cluster(ctx, config): args.extend([ run.Raw('|'), 'head', '-n', '1', ]) - stdout = mon0_remote.sh(args) - return stdout or None + r = mon0_remote.run( + stdout=BytesIO(), + args=args, + stderr=StringIO(), + ) + stdout = r.stdout.getvalue().decode() + if stdout: + return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr + return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', config['log_ignorelist']) is not None: diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py index dab61c2c700..0cde6050718 100644 --- a/qa/tasks/cephadm.py +++ b/qa/tasks/cephadm.py @@ -475,12 +475,16 @@ def ceph_log(ctx, config): run.Raw('|'), 'head', '-n', '1', ]) r = ctx.ceph[cluster_name].bootstrap_remote.run( - stdout=StringIO(), + stdout=BytesIO(), args=args, + stderr=StringIO(), ) - stdout = r.stdout.getvalue() - if stdout != '': + stdout = r.stdout.getvalue().decode() + if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None # NOTE: technically the first and third arg to first_in_ceph_log diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py 
index 346f139874b..468378fce3d 100644 --- a/qa/tasks/cephfs/test_exports.py +++ b/qa/tasks/cephfs/test_exports.py @@ -153,6 +153,8 @@ class TestExportPin(CephFSTestCase): # vstart.sh sets mds_debug_subtrees to True. That causes a ESubtreeMap # to be written out every event. Yuck! self.config_set('mds', 'mds_debug_subtrees', False) + # make sure ESubtreeMap is written frequently enough: + self.config_set('mds', 'mds_log_minor_segments_per_major_segment', '4') self.config_rm('mds', 'mds bal split size') # don't split /top self.mount_a.run_shell_payload("rm -rf 1") diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 29af1e76a4f..46139163ddd 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -1,3 +1,4 @@ +import re import time import signal import logging @@ -342,6 +343,60 @@ class TestClusterResize(CephFSTestCase): self.fs.wait_for_daemons(timeout=90) +class TestFailoverBeaconHealth(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def initiate_journal_replay(self, num_files=100): + """ Initiate journal replay by creating files and restarting mds server.""" + + self.config_set("mds", "mds_delay_journal_replay_for_testing", "5000") + self.mounts[0].test_files = [str(x) for x in range(num_files)] + self.mounts[0].create_files() + self.fs.fail() + self.fs.set_joinable() + + def test_replay_beacon_estimated_time(self): + """ + That beacon emits warning message with estimated time to complete replay + """ + self.initiate_journal_replay() + self.wait_for_health("MDS_ESTIMATED_REPLAY_TIME", 60) + # remove the config so that replay finishes and the cluster + # is HEALTH_OK + self.config_rm("mds", "mds_delay_journal_replay_for_testing") + self.wait_for_health_clear(timeout=60) + + def test_replay_estimated_time_accuracy(self): + self.initiate_journal_replay(250) + def replay_complete(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in 
health['checks']] + return 'MDS_ESTIMATED_REPLAY_TIME' not in codes + + def get_estimated_time(): + completion_percentage = 0.0 + time_duration = pending_duration = 0 + with safe_while(sleep=5, tries=360) as proceed: + while proceed(): + health = self.ceph_cluster.mon_manager.get_mon_health(debug=False, detail=True) + codes = [s for s in health['checks']] + if 'MDS_ESTIMATED_REPLAY_TIME' in codes: + message = health['checks']['MDS_ESTIMATED_REPLAY_TIME']['detail'][0]['message'] + ### sample warning string: "mds.a(mds.0): replay: 50.0446% complete - elapsed time: 582s, estimated time remaining: 581s" + m = re.match(".* replay: (\d+(\.\d+)?)% complete - elapsed time: (\d+)s, estimated time remaining: (\d+)s", message) + if not m: + continue + completion_percentage = float(m.group(1)) + time_duration = int(m.group(3)) + pending_duration = int(m.group(4)) + log.debug(f"MDS_ESTIMATED_REPLAY_TIME is present in health: {message}, duration: {time_duration}, completion_percentage: {completion_percentage}") + if completion_percentage >= 50: + return (completion_percentage, time_duration, pending_duration) + _, _, pending_duration = get_estimated_time() + # wait for 25% more time to avoid false negative failures + self.wait_until_true(replay_complete, timeout=pending_duration * 1.25) + class TestFailover(CephFSTestCase): CLIENTS_REQUIRED = 1 MDSS_REQUIRED = 2 diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py index faa35be6926..0a1c07dce04 100644 --- a/qa/tasks/cephfs/test_nfs.py +++ b/qa/tasks/cephfs/test_nfs.py @@ -369,6 +369,45 @@ class TestNFS(MgrTestCase): except CommandFailedError as e: self.fail(f"expected read/write of a file to be successful but failed with {e.exitstatus}") + def _mnt_nfs(self, pseudo_path, port, ip): + ''' + Mount created export + :param pseudo_path: It is the pseudo root name + :param port: Port of deployed nfs cluster + :param ip: IP of deployed nfs cluster + ''' + tries = 3 + while True: + try: + self.ctx.cluster.run( + 
args=['sudo', 'mount', '-t', 'nfs', '-o', f'port={port}', + f'{ip}:{pseudo_path}', '/mnt']) + break + except CommandFailedError: + if tries: + tries -= 1 + time.sleep(2) + continue + raise + + self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt']) + + def _test_fio(self, pseudo_path, port, ip): + ''' + run fio with libaio on /mnt/fio + :param mnt_path: nfs mount point + ''' + try: + self._mnt_nfs(pseudo_path, port, ip) + self.ctx.cluster.run(args=['mkdir', '/mnt/fio']) + fio_cmd=['sudo', 'fio', '--ioengine=libaio', '-directory=/mnt/fio', '--filename=fio.randrw.test', '--name=job', '--bs=16k', '--direct=1', '--group_reporting', '--iodepth=128', '--randrepeat=0', '--norandommap=1', '--thread=2', '--ramp_time=20s', '--offset_increment=5%', '--size=5G', '--time_based', '--runtime=300', '--ramp_time=1s', '--percentage_random=0', '--rw=randrw', '--rwmixread=50'] + self.ctx.cluster.run(args=fio_cmd) + except CommandFailedError as e: + self.fail(f"expected fio to be successful but failed with {e.exitstatus}") + finally: + self.ctx.cluster.run(args=['sudo', 'rm', '-rf', '/mnt/fio']) + self.ctx.cluster.run(args=['sudo', 'umount', '/mnt']) + def _write_to_read_only_export(self, pseudo_path, port, ip): ''' Check if write to read only export fails @@ -627,6 +666,18 @@ class TestNFS(MgrTestCase): self._test_data_read_write(self.pseudo_path, port, ip) self._test_delete_cluster() + def test_async_io_fio(self): + ''' + Test async io using fio. 
Expect completion without hang or crash + ''' + self._test_create_cluster() + self._create_export(export_id='1', create_fs=True, + extra_cmd=['--pseudo-path', self.pseudo_path]) + port, ip = self._get_port_ip_info() + self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed') + self._test_fio(self.pseudo_path, port, ip) + self._test_delete_cluster() + def test_cluster_info(self): ''' Test cluster info outputs correct ip and hostname diff --git a/qa/tasks/kafka_failover.py b/qa/tasks/kafka_failover.py new file mode 100644 index 00000000000..3ca60ab84fc --- /dev/null +++ b/qa/tasks/kafka_failover.py @@ -0,0 +1,244 @@ +""" +Deploy and configure Kafka for Teuthology +""" +import contextlib +import logging +import time +import os + +from teuthology import misc as teuthology +from teuthology import contextutil +from teuthology.orchestra import run + +log = logging.getLogger(__name__) + +def get_kafka_version(config): + for client, client_config in config.items(): + if 'kafka_version' in client_config: + kafka_version = client_config.get('kafka_version') + return kafka_version + +kafka_prefix = 'kafka_2.13-' + +def get_kafka_dir(ctx, config): + kafka_version = get_kafka_version(config) + current_version = kafka_prefix + kafka_version + return '{tdir}/{ver}'.format(tdir=teuthology.get_testdir(ctx),ver=current_version) + + +@contextlib.contextmanager +def install_kafka(ctx, config): + """ + Downloading the kafka tar file. 
+ """ + assert isinstance(config, dict) + log.info('Installing Kafka...') + + # programmatically find a nearby mirror so as not to hammer archive.apache.org + apache_mirror_cmd="curl 'https://www.apache.org/dyn/closer.cgi' 2>/dev/null | " \ + "grep -o '<strong>[^<]*</strong>' | sed 's/<[^>]*>//g' | head -n 1" + log.info("determining apache mirror by running: " + apache_mirror_cmd) + apache_mirror_url_front = os.popen(apache_mirror_cmd).read().rstrip() # note: includes trailing slash (/) + log.info("chosen apache mirror is " + apache_mirror_url_front) + + for (client, _) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + test_dir=teuthology.get_testdir(ctx) + current_version = get_kafka_version(config) + + kafka_file = kafka_prefix + current_version + '.tgz' + + link1 = '{apache_mirror_url_front}/kafka/'.format(apache_mirror_url_front=apache_mirror_url_front) + \ + current_version + '/' + kafka_file + ctx.cluster.only(client).run( + args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'wget', link1], + ) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}'.format(tdir=test_dir), run.Raw('&&'), 'tar', '-xvzf', kafka_file], + ) + + kafka_dir = get_kafka_dir(ctx, config) + # create config for second broker + second_broker_config_name = "server2.properties" + second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir) + second_broker_data_logs_escaped = "{}/logs".format(second_broker_data).replace("/", "\/") + + ctx.cluster.only(client).run( + args=['cd', '{tdir}'.format(tdir=kafka_dir), run.Raw('&&'), + 'cp', '{tdir}/config/server.properties'.format(tdir=kafka_dir), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'mkdir', '-p', '{tdir}/data'.format(tdir=kafka_dir) + ], + ) + + # edit config + ctx.cluster.only(client).run( + args=['sed', '-i', 's/broker.id=0/broker.id=1/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, 
second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'sed', '-i', 's/#listeners=PLAINTEXT:\/\/:9092/listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'sed', '-i', 's/#advertised.listeners=PLAINTEXT:\/\/your.host.name:9092/advertised.listeners=PLAINTEXT:\/\/localhost:19092/g', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'sed', '-i', 's/log.dirs=\/tmp\/kafka-logs/log.dirs={}/g'.format(second_broker_data_logs_escaped), '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name), run.Raw('&&'), + 'cat', '{tdir}/config/{second_broker_config_name}'.format(tdir=kafka_dir, second_broker_config_name=second_broker_config_name) + ] + ) + + try: + yield + finally: + log.info('Removing packaged dependencies of Kafka...') + test_dir=get_kafka_dir(ctx, config) + current_version = get_kafka_version(config) + for (client,_) in config.items(): + ctx.cluster.only(client).run( + args=['rm', '-rf', '{tdir}/logs'.format(tdir=test_dir)], + ) + + ctx.cluster.only(client).run( + args=['rm', '-rf', test_dir], + ) + + ctx.cluster.only(client).run( + args=['rm', '-rf', '{tdir}/{doc}'.format(tdir=teuthology.get_testdir(ctx),doc=kafka_file)], + ) + + +@contextlib.contextmanager +def run_kafka(ctx,config): + """ + This includes two parts: + 1. Starting Zookeeper service + 2. 
Starting Kafka service + """ + assert isinstance(config, dict) + log.info('Bringing up Zookeeper and Kafka services...') + for (client,_) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + kafka_dir = get_kafka_dir(ctx, config) + + second_broker_data = "{tdir}/data/broker02".format(tdir=kafka_dir) + second_broker_java_log_dir = "{}/java_logs".format(second_broker_data) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'), + './zookeeper-server-start.sh', + '{tir}/config/zookeeper.properties'.format(tir=kafka_dir), + run.Raw('&'), 'exit' + ], + ) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'), + './kafka-server-start.sh', + '{tir}/config/server.properties'.format(tir=get_kafka_dir(ctx, config)), + run.Raw('&'), 'exit' + ], + ) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=kafka_dir), run.Raw('&&'), + run.Raw('LOG_DIR={second_broker_java_log_dir}'.format(second_broker_java_log_dir=second_broker_java_log_dir)), + './kafka-server-start.sh', '{tdir}/config/server2.properties'.format(tdir=kafka_dir), + run.Raw('&'), 'exit' + ], + ) + + try: + yield + finally: + log.info('Stopping Zookeeper and Kafka Services...') + + for (client, _) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './kafka-server-stop.sh', + '{tir}/config/kafka.properties'.format(tir=get_kafka_dir(ctx, config)), + ], + ) + + time.sleep(5) + + ctx.cluster.only(client).run( + args=['cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './zookeeper-server-stop.sh', + '{tir}/config/zookeeper.properties'.format(tir=get_kafka_dir(ctx, config)), + ], + ) + + time.sleep(5) + + ctx.cluster.only(client).run(args=['killall', '-9', 'java']) + + +@contextlib.contextmanager +def run_admin_cmds(ctx,config): + 
""" + Running Kafka Admin commands in order to check the working of producer anf consumer and creation of topic. + """ + assert isinstance(config, dict) + log.info('Checking kafka server through producer/consumer commands...') + for (client,_) in config.items(): + (remote,) = ctx.cluster.only(client).remotes.keys() + + ctx.cluster.only(client).run( + args=[ + 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './kafka-topics.sh', '--create', '--topic', 'quickstart-events', + '--bootstrap-server', 'localhost:9092' + ], + ) + + ctx.cluster.only(client).run( + args=[ + 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + 'echo', "First", run.Raw('|'), + './kafka-console-producer.sh', '--topic', 'quickstart-events', + '--bootstrap-server', 'localhost:9092' + ], + ) + + ctx.cluster.only(client).run( + args=[ + 'cd', '{tdir}/bin'.format(tdir=get_kafka_dir(ctx, config)), run.Raw('&&'), + './kafka-console-consumer.sh', '--topic', 'quickstart-events', + '--from-beginning', + '--bootstrap-server', 'localhost:9092', + run.Raw('&'), 'exit' + ], + ) + + try: + yield + finally: + pass + + +@contextlib.contextmanager +def task(ctx,config): + """ + Following is the way how to run kafka:: + tasks: + - kafka: + client.0: + kafka_version: 2.6.0 + """ + assert config is None or isinstance(config, list) \ + or isinstance(config, dict), \ + "task kafka only supports a list or dictionary for configuration" + + all_clients = ['client.{id}'.format(id=id_) + for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] + if config is None: + config = all_clients + if isinstance(config, list): + config = dict.fromkeys(config) + + log.debug('Kafka config is %s', config) + + with contextutil.nested( + lambda: install_kafka(ctx=ctx, config=config), + lambda: run_kafka(ctx=ctx, config=config), + lambda: run_admin_cmds(ctx=ctx, config=config), + ): + yield + diff --git a/qa/tasks/notification_tests.py b/qa/tasks/notification_tests.py index 
b4697a6f797..f1eae3c89c4 100644 --- a/qa/tasks/notification_tests.py +++ b/qa/tasks/notification_tests.py @@ -220,7 +220,7 @@ def run_tests(ctx, config): for client, client_config in config.items(): (remote,) = ctx.cluster.only(client).remotes.keys() - attr = ["!kafka_test", "!data_path_v2_kafka_test", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"] + attr = ["!kafka_test", "!data_path_v2_kafka_test", "!kafka_failover", "!amqp_test", "!amqp_ssl_test", "!kafka_security_test", "!modification_required", "!manual_test", "!http_test"] if 'extra_attr' in client_config: attr = client_config.get('extra_attr') diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py index c58a7267b4e..691a6f7dd86 100644 --- a/qa/tasks/nvmeof.py +++ b/qa/tasks/nvmeof.py @@ -315,7 +315,7 @@ class NvmeofThrasher(Thrasher, Greenlet): def _get_devices(self, remote): GET_DEVICE_CMD = "sudo nvme list --output-format=json | " \ - "jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == \"Ceph bdev Controller\") | .DevicePath'" + "jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"Ceph bdev Controller\")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace'" devices = remote.sh(GET_DEVICE_CMD).split() return devices diff --git a/qa/tasks/rgw_multisite.py b/qa/tasks/rgw_multisite.py index e83a54efc2b..f93ca017fa2 100644 --- a/qa/tasks/rgw_multisite.py +++ b/qa/tasks/rgw_multisite.py @@ -361,6 +361,8 @@ def create_zonegroup(cluster, gateways, period, config): if endpoints: # replace client names with their gateway endpoints config['endpoints'] = extract_gateway_endpoints(gateways, endpoints) + if not config.get('api_name'): # otherwise it will be set to an empty string + config['api_name'] = config['name'] zonegroup = multisite.ZoneGroup(config['name'], period) # `zonegroup set` needs --default on command line, and 'is_master' in json args = is_default_arg(config) diff --git a/qa/tasks/rook.py 
b/qa/tasks/rook.py index 6cb75173966..fae5ef3bf00 100644 --- a/qa/tasks/rook.py +++ b/qa/tasks/rook.py @@ -8,7 +8,7 @@ import json import logging import os import yaml -from io import BytesIO +from io import BytesIO, StringIO from tarfile import ReadError from tasks.ceph_manager import CephManager @@ -235,10 +235,14 @@ def ceph_log(ctx, config): r = ctx.rook[cluster_name].remote.run( stdout=BytesIO(), args=args, + stderr=StringIO(), ) stdout = r.stdout.getvalue().decode() if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', diff --git a/qa/workunits/nvmeof/basic_tests.sh b/qa/workunits/nvmeof/basic_tests.sh index 794353348b4..9e7a1f5134e 100755 --- a/qa/workunits/nvmeof/basic_tests.sh +++ b/qa/workunits/nvmeof/basic_tests.sh @@ -39,7 +39,7 @@ connect_all() { sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600 sleep 5 expected_devices_count=$1 - actual_devices=$(sudo nvme list --output-format=json | grep -o "$SPDK_CONTROLLER" | wc -l) + actual_devices=$(sudo nvme list --output-format=json | jq -r ".Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == \"$SPDK_CONTROLLER\")) | .Namespaces[].NameSpace" | wc -l) if [ "$actual_devices" -ne "$expected_devices_count" ]; then sudo nvme list --output-format=json return 1 @@ -74,7 +74,7 @@ test_run connect test_run list_subsys 1 test_run disconnect_all test_run list_subsys 0 -devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT)) +devices_count=$(( $NVMEOF_NAMESPACES_COUNT * $NVMEOF_SUBSYSTEMS_COUNT )) test_run connect_all $devices_count gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 )) multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT)) diff --git a/qa/workunits/nvmeof/fio_test.sh b/qa/workunits/nvmeof/fio_test.sh index 03fb58693bd..f7f783afc67 100755 --- a/qa/workunits/nvmeof/fio_test.sh +++ 
b/qa/workunits/nvmeof/fio_test.sh @@ -34,7 +34,7 @@ done fio_file=$(mktemp -t nvmeof-fio-XXXX) all_drives_list=$(sudo nvme list --output-format=json | - jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == "Ceph bdev Controller") | .DevicePath') + jq -r '.Devices[].Subsystems[] | select(.Controllers | all(.ModelNumber == "Ceph bdev Controller")) | .Namespaces | sort_by(.NSID) | .[] | .NameSpace') # When the script is passed --start_ns and --end_ns (example: `nvmeof_fio_test.sh --start_ns 1 --end_ns 3`), # then fio runs on namespaces only in the defined range (which is 1 to 3 here). diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh index 2aa27d3d655..0ceb9ff54cf 100755 --- a/qa/workunits/rbd/cli_generic.sh +++ b/qa/workunits/rbd/cli_generic.sh @@ -914,6 +914,11 @@ test_namespace() { rbd group create rbd/test1/group1 rbd group image add rbd/test1/group1 rbd/test1/image1 + rbd group image add --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image2 + rbd group image rm --group-pool rbd --group-namespace test1 --group group1 \ + --image-pool rbd --image-namespace test1 --image image1 + rbd group image rm rbd/test1/group1 rbd/test1/image2 rbd group rm rbd/test1/group1 rbd trash move rbd/test1/image1 diff --git a/src/bash_completion/radosgw-admin b/src/bash_completion/radosgw-admin index 023a83f87e4..d9e36d8ef29 100644 --- a/src/bash_completion/radosgw-admin +++ b/src/bash_completion/radosgw-admin @@ -19,7 +19,7 @@ _radosgw_admin() if [[ ${cur} == -* ]] ; then COMPREPLY=( $(compgen -W "--uid --subuser --access-key --os-user --email --auth_uid --secret --os-secret --gen-access-key --gen-secret \ - --access --display-name --bucket --object --date --conf --name --id --version -s -w" -- ${cur}) ) + --access --display-name --bucket --object --date --conf --name --id --version -s -w --generate-key" -- ${cur}) ) return 0 fi diff --git 
a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py index c278de43eb0..a6d82c7f0fa 100644 --- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py +++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -119,13 +119,12 @@ class Zap: osd_uuid = details.get('osd_uuid') break - for osd_uuid, details in raw_report.items(): + for _, details in raw_report.items(): device: str = details.get('device') if details.get('osd_uuid') == osd_uuid: raw_devices.add(device) return list(raw_devices) - def find_associated_devices(self) -> List[api.Volume]: """From an ``osd_id`` and/or an ``osd_fsid``, filter out all the Logical Volumes (LVs) in the diff --git a/src/ceph-volume/ceph_volume/main.py b/src/ceph-volume/ceph_volume/main.py index f8eca65ec49..4f27f429e89 100644 --- a/src/ceph-volume/ceph_volume/main.py +++ b/src/ceph-volume/ceph_volume/main.py @@ -11,8 +11,16 @@ try: from importlib.metadata import entry_points def get_entry_points(group: str): # type: ignore - return entry_points().get(group, []) # type: ignore + eps = entry_points() + if hasattr(eps, 'select'): + # New importlib.metadata uses .select() + return eps.select(group=group) + else: + # Fallback to older EntryPoints that returns dicts + return eps.get(group, []) # type: ignore + except ImportError: + # Fallback to `pkg_resources` for older versions from pkg_resources import iter_entry_points as entry_points # type: ignore def get_entry_points(group: str): # type: ignore diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py index cca64e83ab0..c971b7776ef 100644 --- a/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py +++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py @@ -22,7 +22,7 @@ ceph_bluestore_tool_output = ''' "whoami": "0" }, "/dev/vdx": { - "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6", + "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b7", "size": 214748364800, 
"btime": "2024-10-16T10:51:05.955279+0000", "description": "main", diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py index 77b55314f66..921e61a4534 100644 --- a/src/ceph-volume/ceph_volume/util/disk.py +++ b/src/ceph-volume/ceph_volume/util/disk.py @@ -347,12 +347,21 @@ def lsblk_all(device: str = '', return result -def is_device(dev): +def is_device(dev: str) -> bool: """ - Boolean to determine if a given device is a block device (**not** - a partition!) + Determines whether the given path corresponds to a block device (not a partition). - For example: /dev/sda would return True, but not /dev/sdc1 + This function checks whether the provided device path represents a valid block device, + such as a physical disk (/dev/sda) or an allowed loop device, but excludes partitions + (/dev/sdc1). It performs several validation steps, including file existence, path format, + device type, and additional checks for loop devices if allowed. + + Args: + dev (str): The path to the device (e.g., "/dev/sda"). + + Returns: + bool: True if the path corresponds to a valid block device (not a partition), + otherwise False. 
""" if not os.path.exists(dev): return False @@ -364,7 +373,7 @@ def is_device(dev): TYPE = lsblk(dev).get('TYPE') if TYPE: - return TYPE in ['disk', 'mpath'] + return TYPE in ['disk', 'mpath', 'loop'] # fallback to stat return _stat_is_device(os.lstat(dev).st_mode) and not is_partition(dev) diff --git a/src/ceph-volume/ceph_volume/util/prepare.py b/src/ceph-volume/ceph_volume/util/prepare.py index 9c863b83d93..ff7fc023fc4 100644 --- a/src/ceph-volume/ceph_volume/util/prepare.py +++ b/src/ceph-volume/ceph_volume/util/prepare.py @@ -9,6 +9,7 @@ import logging import json from ceph_volume import process, conf, terminal from ceph_volume.util import system, constants, str_to_int, disk +from typing import Optional logger = logging.getLogger(__name__) mlogger = terminal.MultiLogger(__name__) @@ -121,7 +122,7 @@ def get_block_wal_size(lv_format=True): return wal_size -def create_id(fsid, json_secrets, osd_id=None): +def create_id(fsid: str, json_secrets: str, osd_id: Optional[str]=None) -> str: """ :param fsid: The osd fsid to create, always required :param json_secrets: a json-ready object with whatever secrets are wanted diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index d2ddf564116..a8616980e4d 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -111,6 +111,7 @@ from cephadmlib.file_utils import ( unlink_file, write_new, write_tmp, + update_meta_file, ) from cephadmlib.net_utils import ( build_addrv_params, @@ -3453,6 +3454,7 @@ def list_daemons( detail: bool = True, legacy_dir: Optional[str] = None, daemon_name: Optional[str] = None, + type_of_daemon: Optional[str] = None, ) -> List[Dict[str, str]]: host_version: Optional[str] = None ls = [] @@ -3489,6 +3491,8 @@ def list_daemons( if os.path.exists(data_dir): for i in os.listdir(data_dir): if i in ['mon', 'osd', 'mds', 'mgr', 'rgw']: + if type_of_daemon and type_of_daemon != i: + continue daemon_type = i for j in os.listdir(os.path.join(data_dir, i)): if '-' not in j: @@ -3525,6 +3529,8 
@@ def list_daemons( if daemon_name and name != daemon_name: continue (daemon_type, daemon_id) = j.split('.', 1) + if type_of_daemon and type_of_daemon != daemon_type: + continue unit_name = get_unit_name(fsid, daemon_type, daemon_id) @@ -4705,6 +4711,34 @@ def command_list_images(ctx: CephadmContext) -> None: # print default images cp_obj.write(sys.stdout) + +def update_service_for_daemon(ctx: CephadmContext, + available_daemons: list, + update_daemons: list) -> None: + """ Update the unit.meta file of daemon with required service name for valid daemons""" + + data = {'service_name': ctx.service_name} + # check if all the daemon names are valid + if not set(update_daemons).issubset(set(available_daemons)): + raise Error(f'Error EINVAL: one or more daemons of {update_daemons} does not exist on this host') + for name in update_daemons: + path = os.path.join(ctx.data_dir, ctx.fsid, name, 'unit.meta') + update_meta_file(path, data) + print(f'Successfully updated daemon {name} with service {ctx.service_name}') + + +@infer_fsid +def command_update_osd_service(ctx: CephadmContext) -> int: + """update service for provided daemon""" + update_daemons = [f'osd.{osd_id}' for osd_id in ctx.osd_ids.split(',')] + daemons = list_daemons(ctx, detail=False, type_of_daemon='osd') + if not daemons: + raise Error(f'Daemon {ctx.osd_ids} does not exists on this host') + available_daemons = [d['name'] for d in daemons] + update_service_for_daemon(ctx, available_daemons, update_daemons) + return 0 + + ################################## @@ -5571,6 +5605,14 @@ def _get_parser(): parser_list_images = subparsers.add_parser( 'list-images', help='list all the default images') parser_list_images.set_defaults(func=command_list_images) + + parser_update_service = subparsers.add_parser( + 'update-osd-service', help='update service for provided daemon') + parser_update_service.set_defaults(func=command_update_osd_service) + parser_update_service.add_argument('--fsid', help='cluster FSID') + 
parser_update_service.add_argument('--osd-ids', required=True, help='Comma-separated OSD IDs') + parser_update_service.add_argument('--service-name', required=True, help='OSD service name') + return parser diff --git a/src/cephadm/cephadmlib/daemons/monitoring.py b/src/cephadm/cephadmlib/daemons/monitoring.py index 9a9402632b0..4ba00daaefb 100644 --- a/src/cephadm/cephadmlib/daemons/monitoring.py +++ b/src/cephadm/cephadmlib/daemons/monitoring.py @@ -16,7 +16,13 @@ from ..daemon_form import register as register_daemon_form from ..daemon_identity import DaemonIdentity from ..deployment_utils import to_deployment_container from ..exceptions import Error -from ..net_utils import get_fqdn, get_hostname, get_ip_addresses, wrap_ipv6 +from ..net_utils import ( + get_fqdn, + get_hostname, + get_ip_addresses, + wrap_ipv6, + EndPoint, +) @register_daemon_form @@ -89,11 +95,6 @@ class Monitoring(ContainerDaemonForm): 'image': DefaultImages.ALERTMANAGER.image_ref, 'cpus': '2', 'memory': '2GB', - 'args': [ - '--cluster.listen-address=:{}'.format( - port_map['alertmanager'][1] - ), - ], 'config-json-files': [ 'alertmanager.yml', ], @@ -248,11 +249,14 @@ class Monitoring(ContainerDaemonForm): ip = meta['ip'] if 'ports' in meta and meta['ports']: port = meta['ports'][0] - if daemon_type == 'prometheus': - config = fetch_configs(ctx) + config = fetch_configs(ctx) + if daemon_type in ['prometheus', 'alertmanager']: ip_to_bind_to = config.get('ip_to_bind_to', '') if ip_to_bind_to: ip = ip_to_bind_to + web_listen_addr = str(EndPoint(ip, port)) + r += [f'--web.listen-address={web_listen_addr}'] + if daemon_type == 'prometheus': retention_time = config.get('retention_time', '15d') retention_size = config.get( 'retention_size', '0' @@ -276,9 +280,11 @@ class Monitoring(ContainerDaemonForm): r += ['--web.route-prefix=/prometheus/'] else: r += [f'--web.external-url={scheme}://{host}:{port}'] - r += [f'--web.listen-address={ip}:{port}'] if daemon_type == 'alertmanager': - config = 
fetch_configs(ctx) + clus_listen_addr = str( + EndPoint(ip, self.port_map[daemon_type][1]) + ) + r += [f'--cluster.listen-address={clus_listen_addr}'] use_url_prefix = config.get('use_url_prefix', False) peers = config.get('peers', list()) # type: ignore for peer in peers: @@ -294,13 +300,11 @@ class Monitoring(ContainerDaemonForm): if daemon_type == 'promtail': r += ['--config.expand-env'] if daemon_type == 'prometheus': - config = fetch_configs(ctx) try: r += [f'--web.config.file={config["web_config"]}'] except KeyError: pass if daemon_type == 'node-exporter': - config = fetch_configs(ctx) try: r += [f'--web.config.file={config["web_config"]}'] except KeyError: diff --git a/src/cephadm/cephadmlib/file_utils.py b/src/cephadm/cephadmlib/file_utils.py index 27e70e31756..4dd88cc3671 100644 --- a/src/cephadm/cephadmlib/file_utils.py +++ b/src/cephadm/cephadmlib/file_utils.py @@ -5,6 +5,7 @@ import datetime import logging import os import tempfile +import json from contextlib import contextmanager from pathlib import Path @@ -157,3 +158,26 @@ def unlink_file( except Exception: if not ignore_errors: raise + + +def update_meta_file(file_path: str, update_key_val: dict) -> None: + """Update key in the file with provided value""" + try: + with open(file_path, 'r') as fh: + data = json.load(fh) + file_stat = os.stat(file_path) + except FileNotFoundError: + raise + except Exception: + logger.exception(f'Failed to update {file_path}') + raise + data.update( + {key: value for key, value in update_key_val.items() if key in data} + ) + + with write_new( + file_path, + owner=(file_stat.st_uid, file_stat.st_gid), + perms=(file_stat.st_mode & 0o777), + ) as fh: + fh.write(json.dumps(data, indent=4) + '\n') diff --git a/src/cephadm/cephadmlib/net_utils.py b/src/cephadm/cephadmlib/net_utils.py index 9a7f138b1c6..bfa61d933ef 100644 --- a/src/cephadm/cephadmlib/net_utils.py +++ b/src/cephadm/cephadmlib/net_utils.py @@ -24,12 +24,22 @@ class EndPoint: def __init__(self, ip: str, port: 
int) -> None: self.ip = ip self.port = port + self.is_ipv4 = True + try: + if ip and ipaddress.ip_network(ip).version == 6: + self.is_ipv4 = False + except Exception: + logger.exception('Failed to check ip address version') def __str__(self) -> str: - return f'{self.ip}:{self.port}' + if self.is_ipv4: + return f'{self.ip}:{self.port}' + return f'[{self.ip}]:{self.port}' def __repr__(self) -> str: - return f'{self.ip}:{self.port}' + if self.is_ipv4: + return f'{self.ip}:{self.port}' + return f'[{self.ip}]:{self.port}' def attempt_bind(ctx, s, address, port): diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index c5094db335f..1736639ed55 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -316,7 +316,7 @@ def test_deploy_a_monitoring_container(cephadm_fs, funkypatch): runfile_lines = f.read().splitlines() assert 'podman' in runfile_lines[-1] assert runfile_lines[-1].endswith( - 'quay.io/titans/prometheus:latest --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus --storage.tsdb.retention.time=15d --storage.tsdb.retention.size=0 --web.external-url=http://10.10.10.10:9095 --web.listen-address=1.2.3.4:9095' + 'quay.io/titans/prometheus:latest --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus --web.listen-address=1.2.3.4:9095 --storage.tsdb.retention.time=15d --storage.tsdb.retention.size=0 --web.external-url=http://10.10.10.10:9095' ) assert '--user 8765' in runfile_lines[-1] assert f'-v /var/lib/ceph/{fsid}/prometheus.fire/etc/prometheus:/etc/prometheus:Z' in runfile_lines[-1] diff --git a/src/client/Client.cc b/src/client/Client.cc index baf8fb4299d..00b85a8e746 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -11753,8 +11753,12 @@ int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, cond_iofinish = new C_SaferCond(); filer_iofinish.reset(cond_iofinish); } else { - //Register a wrapper callback for the 
C_Write_Finisher which takes 'client_lock' - filer_iofinish.reset(new C_Lock_Client_Finisher(this, iofinish.get())); + //Register a wrapper callback C_Lock_Client_Finisher for the C_Write_Finisher which takes 'client_lock'. + //Use C_OnFinisher for callbacks. The op_cancel_writes has to be called without 'client_lock' held because + //the callback registered here needs to take it. This would cause incorrect lock order i.e., objecter->rwlock + //taken by objecter's op_cancel and then 'client_lock' taken by callback. To fix the lock order, queue + //the callback using the finisher + filer_iofinish.reset(new C_OnFinisher(new C_Lock_Client_Finisher(this, iofinish.get()), &objecter_finisher)); } get_cap_ref(in, CEPH_CAP_FILE_BUFFER); diff --git a/src/cls/rgw/cls_rgw_types.cc b/src/cls/rgw/cls_rgw_types.cc index d5f6ba4bdee..9fd60aaff3f 100644 --- a/src/cls/rgw/cls_rgw_types.cc +++ b/src/cls/rgw/cls_rgw_types.cc @@ -194,7 +194,9 @@ void rgw_bucket_dir_entry_meta::dump(Formatter *f) const utime_t ut(mtime); encode_json("mtime", ut, f); encode_json("etag", etag, f); - encode_json("storage_class", storage_class, f); + encode_json("storage_class", + rgw_placement_rule::get_canonical_storage_class(storage_class), + f); encode_json("owner", owner, f); encode_json("owner_display_name", owner_display_name, f); encode_json("content_type", content_type, f); diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index ea3cce16609..c607839a8d2 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -13,6 +13,7 @@ if(WIN32) endif() add_subdirectory(io_exerciser) +add_subdirectory(json) add_subdirectory(options) set(common_srcs diff --git a/src/common/bit_vector.hpp b/src/common/bit_vector.hpp index 961d9a0192e..c5fd491ed29 100644 --- a/src/common/bit_vector.hpp +++ b/src/common/bit_vector.hpp @@ -29,8 +29,8 @@ private: static const uint8_t MASK = static_cast<uint8_t>((1 << _bit_count) - 1); // must be power of 2 - BOOST_STATIC_ASSERT((_bit_count != 0) && 
!(_bit_count & (_bit_count - 1))); - BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE); + static_assert((_bit_count != 0) && !(_bit_count & (_bit_count - 1))); + static_assert(_bit_count <= BITS_PER_BYTE); template <typename DataIterator> class ReferenceImpl { diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h index 01feff4c063..0b05be5372e 100644 --- a/src/common/ceph_time.h +++ b/src/common/ceph_time.h @@ -342,6 +342,23 @@ public: } }; +// Please note time_guard is not thread safety -- multiple threads +// updating same diff_accumulator can corrupt it. +template <class ClockT = mono_clock> +class time_guard { + const typename ClockT::time_point start; + timespan& diff_accumulator; + +public: + time_guard(timespan& diff_accumulator) + : start(ClockT::now()), + diff_accumulator(diff_accumulator) { + } + ~time_guard() { + diff_accumulator += ClockT::now() - start; + } +}; + namespace time_detail { // So that our subtractions produce negative spans rather than // arithmetic underflow. diff --git a/src/common/config_cacher.h b/src/common/config_cacher.h index 91b8152dde1..f23195955a1 100644 --- a/src/common/config_cacher.h +++ b/src/common/config_cacher.h @@ -18,21 +18,30 @@ #include "common/config_obs.h" #include "common/config.h" +/** + * A simple class to cache a single configuration value. + * Points to note: + * - as get_tracked_conf_keys() must return a pointer to a null-terminated + * array of C-strings, 'keys' - an array - is used to hold the sole key + * that this observer is interested in. + * - the const cast should be removed once we change the + * get_tracked_conf_keys() to return const char* const * (or something + * similar). 
+ */ template <typename ValueT> class md_config_cacher_t : public md_config_obs_t { ConfigProxy& conf; - const char* const option_name; + const char* keys[2]; std::atomic<ValueT> value_cache; const char** get_tracked_conf_keys() const override { - const static char* keys[] = { option_name, nullptr }; - return keys; + return const_cast<const char**>(keys); } void handle_conf_change(const ConfigProxy& conf, const std::set<std::string>& changed) override { - if (changed.count(option_name)) { - value_cache.store(conf.get_val<ValueT>(option_name)); + if (changed.contains(keys[0])) { + value_cache.store(conf.get_val<ValueT>(keys[0])); } } @@ -40,10 +49,10 @@ public: md_config_cacher_t(ConfigProxy& conf, const char* const option_name) : conf(conf), - option_name(option_name) { + keys{option_name, nullptr} { conf.add_observer(this); std::atomic_init(&value_cache, - conf.get_val<ValueT>(option_name)); + conf.get_val<ValueT>(keys[0])); } ~md_config_cacher_t() { diff --git a/src/common/io_exerciser/CMakeLists.txt b/src/common/io_exerciser/CMakeLists.txt index 07091df86e1..ab2e64fc222 100644 --- a/src/common/io_exerciser/CMakeLists.txt +++ b/src/common/io_exerciser/CMakeLists.txt @@ -5,9 +5,11 @@ add_library(object_io_exerciser STATIC Model.cc ObjectModel.cc RadosIo.cc + EcIoSequence.cc ) target_link_libraries(object_io_exerciser - librados + librados global + json_structures )
\ No newline at end of file diff --git a/src/common/io_exerciser/DataGenerator.cc b/src/common/io_exerciser/DataGenerator.cc index 9aa77eeb6e9..701c32fa9ec 100644 --- a/src/common/io_exerciser/DataGenerator.cc +++ b/src/common/io_exerciser/DataGenerator.cc @@ -2,32 +2,28 @@ // vim: ts=8 sw=2 smarttab #include "DataGenerator.h" -#include "ObjectModel.h" +#include <chrono> +#include <iostream> +#include <stdexcept> +#include "ObjectModel.h" #include "common/debug.h" #include "common/dout.h" - #include "fmt/format.h" #include "fmt/ranges.h" -#include <chrono> -#include <iostream> -#include <stdexcept> - #define dout_subsys ceph_subsys_rados #define dout_context g_ceph_context using DataGenerator = ceph::io_exerciser::data_generation::DataGenerator; -using SeededRandomGenerator = ceph::io_exerciser::data_generation - ::SeededRandomGenerator; -using HeaderedSeededRandomGenerator = ceph::io_exerciser::data_generation - ::HeaderedSeededRandomGenerator; +using SeededRandomGenerator = + ceph::io_exerciser::data_generation ::SeededRandomGenerator; +using HeaderedSeededRandomGenerator = + ceph::io_exerciser::data_generation ::HeaderedSeededRandomGenerator; std::unique_ptr<DataGenerator> DataGenerator::create_generator( - GenerationType generationType, const ObjectModel& model) -{ - switch(generationType) - { + GenerationType generationType, const ObjectModel& model) { + switch (generationType) { case GenerationType::SeededRandom: return std::make_unique<SeededRandomGenerator>(model); case GenerationType::HeaderedSeededRandom: @@ -39,28 +35,25 @@ std::unique_ptr<DataGenerator> DataGenerator::create_generator( return nullptr; } -bufferlist DataGenerator::generate_wrong_data(uint64_t offset, uint64_t length) -{ +bufferlist DataGenerator::generate_wrong_data(uint64_t offset, + uint64_t length) { bufferlist retlist; uint64_t block_size = m_model.get_block_size(); char buffer[block_size]; - for (uint64_t block_offset = offset; - block_offset < offset + length; - block_offset++) - { 
+ for (uint64_t block_offset = offset; block_offset < offset + length; + block_offset++) { std::memset(buffer, 0, block_size); retlist.append(ceph::bufferptr(buffer, block_size)); } return retlist; } -bool DataGenerator::validate(bufferlist& bufferlist, uint64_t offset, uint64_t length) -{ +bool DataGenerator::validate(bufferlist& bufferlist, uint64_t offset, + uint64_t length) { return bufferlist.contents_equal(generate_data(offset, length)); } -ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset) -{ +ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset) { uint64_t block_size = m_model.get_block_size(); char buffer[block_size]; @@ -70,29 +63,26 @@ ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset) constexpr size_t generation_length = sizeof(uint64_t); - for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--) - { + for (uint64_t i = 0; i < block_size; + i += (2 * generation_length), rand1++, rand2--) { std::memcpy(buffer + i, &rand1, generation_length); std::memcpy(buffer + i + generation_length, &rand2, generation_length); } size_t remainingBytes = block_size % (generation_length * 2); - if (remainingBytes > generation_length) - { + if (remainingBytes > generation_length) { size_t remainingBytes2 = remainingBytes - generation_length; std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2); - } - else if (remainingBytes > 0) - { + } else if (remainingBytes > 0) { std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); } return ceph::bufferptr(buffer, block_size); } -ceph::bufferptr SeededRandomGenerator::generate_wrong_block(uint64_t block_offset) -{ +ceph::bufferptr SeededRandomGenerator::generate_wrong_block( + uint64_t block_offset) { uint64_t block_size = m_model.get_block_size(); char buffer[block_size]; @@ -102,141 +92,134 @@ ceph::bufferptr 
SeededRandomGenerator::generate_wrong_block(uint64_t block_offse constexpr size_t generation_length = sizeof(uint64_t); - for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--) - { + for (uint64_t i = 0; i < block_size; + i += (2 * generation_length), rand1++, rand2--) { std::memcpy(buffer + i, &rand1, generation_length); std::memcpy(buffer + i + generation_length, &rand2, generation_length); } size_t remainingBytes = block_size % (generation_length * 2); - if (remainingBytes > generation_length) - { + if (remainingBytes > generation_length) { size_t remainingBytes2 = remainingBytes - generation_length; std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2); - } - else if (remainingBytes > 0) - { + } else if (remainingBytes > 0) { std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); } return ceph::bufferptr(buffer, block_size); } -bufferlist SeededRandomGenerator::generate_data(uint64_t offset, uint64_t length) -{ +bufferlist SeededRandomGenerator::generate_data(uint64_t offset, + uint64_t length) { bufferlist retlist; - for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++) - { + for (uint64_t block_offset = offset; block_offset < offset + length; + block_offset++) { retlist.append(generate_block(block_offset)); } return retlist; } -bufferlist SeededRandomGenerator::generate_wrong_data(uint64_t offset, uint64_t length) -{ +bufferlist SeededRandomGenerator::generate_wrong_data(uint64_t offset, + uint64_t length) { bufferlist retlist; - for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++) - { + for (uint64_t block_offset = offset; block_offset < offset + length; + block_offset++) { retlist.append(generate_wrong_block(block_offset)); } return retlist; } -HeaderedSeededRandomGenerator - ::HeaderedSeededRandomGenerator(const ObjectModel& model, - 
std::optional<uint64_t> unique_run_id) : - SeededRandomGenerator(model), - unique_run_id(unique_run_id.value_or(generate_unique_run_id())) -{ - -} +HeaderedSeededRandomGenerator ::HeaderedSeededRandomGenerator( + const ObjectModel& model, std::optional<uint64_t> unique_run_id) + : SeededRandomGenerator(model), + unique_run_id(unique_run_id.value_or(generate_unique_run_id())) {} -uint64_t HeaderedSeededRandomGenerator::generate_unique_run_id() -{ +uint64_t HeaderedSeededRandomGenerator::generate_unique_run_id() { std::mt19937_64 random_generator = - std::mt19937_64(duration_cast<std::chrono::milliseconds>( - std::chrono::system_clock::now().time_since_epoch()).count()); + std::mt19937_64(duration_cast<std::chrono::milliseconds>( + std::chrono::system_clock::now().time_since_epoch()) + .count()); - return random_generator(); + return random_generator(); } -ceph::bufferptr HeaderedSeededRandomGenerator::generate_block(uint64_t block_offset) -{ +ceph::bufferptr HeaderedSeededRandomGenerator::generate_block( + uint64_t block_offset) { SeedBytes seed = m_model.get_seed(block_offset); - TimeBytes current_time = duration_cast<std::chrono::milliseconds>( - std::chrono::system_clock::now().time_since_epoch()).count(); + TimeBytes current_time = + duration_cast<std::chrono::milliseconds>( + std::chrono::system_clock::now().time_since_epoch()) + .count(); - ceph::bufferptr bufferptr = SeededRandomGenerator::generate_block(block_offset); + ceph::bufferptr bufferptr = + SeededRandomGenerator::generate_block(block_offset); - std::memcpy(bufferptr.c_str() + uniqueIdStart(), &unique_run_id, uniqueIdLength()); + std::memcpy(bufferptr.c_str() + uniqueIdStart(), &unique_run_id, + uniqueIdLength()); std::memcpy(bufferptr.c_str() + seedStart(), &seed, seedLength()); std::memcpy(bufferptr.c_str() + timeStart(), ¤t_time, timeLength()); return bufferptr; } -ceph::bufferptr HeaderedSeededRandomGenerator::generate_wrong_block(uint64_t block_offset) -{ +ceph::bufferptr 
HeaderedSeededRandomGenerator::generate_wrong_block( + uint64_t block_offset) { return HeaderedSeededRandomGenerator::generate_block(block_offset % 8); } const HeaderedSeededRandomGenerator::UniqueIdBytes - HeaderedSeededRandomGenerator::readUniqueRunId(uint64_t block_offset, - const bufferlist& bufferlist) -{ +HeaderedSeededRandomGenerator::readUniqueRunId(uint64_t block_offset, + const bufferlist& bufferlist) { UniqueIdBytes read_unique_run_id = 0; - std::memcpy(&read_unique_run_id, - &bufferlist[(block_offset * m_model.get_block_size()) + uniqueIdStart()], - uniqueIdLength()); + std::memcpy( + &read_unique_run_id, + &bufferlist[(block_offset * m_model.get_block_size()) + uniqueIdStart()], + uniqueIdLength()); return read_unique_run_id; } const HeaderedSeededRandomGenerator::SeedBytes - HeaderedSeededRandomGenerator::readSeed(uint64_t block_offset, - const bufferlist& bufferlist) -{ +HeaderedSeededRandomGenerator::readSeed(uint64_t block_offset, + const bufferlist& bufferlist) { SeedBytes read_seed = 0; - std::memcpy(&read_seed, - &bufferlist[(block_offset * m_model.get_block_size()) + seedStart()], - seedLength()); + std::memcpy( + &read_seed, + &bufferlist[(block_offset * m_model.get_block_size()) + seedStart()], + seedLength()); return read_seed; } const HeaderedSeededRandomGenerator::TimeBytes - HeaderedSeededRandomGenerator::readDateTime(uint64_t block_offset, - const bufferlist& bufferlist) -{ +HeaderedSeededRandomGenerator::readDateTime(uint64_t block_offset, + const bufferlist& bufferlist) { TimeBytes read_time = 0; - std::memcpy(&read_time, - &bufferlist[(block_offset * m_model.get_block_size()) + timeStart()], - timeLength()); + std::memcpy( + &read_time, + &bufferlist[(block_offset * m_model.get_block_size()) + timeStart()], + timeLength()); return read_time; } bool HeaderedSeededRandomGenerator::validate(bufferlist& bufferlist, - uint64_t offset, uint64_t length) -{ + uint64_t offset, uint64_t length) { std::vector<uint64_t> invalid_block_offsets; - 
for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++) - { - bool valid_block - = validate_block(block_offset, - (bufferlist.c_str() + ((block_offset - offset) * - m_model.get_block_size()))); - if (!valid_block) - { + for (uint64_t block_offset = offset; block_offset < offset + length; + block_offset++) { + bool valid_block = validate_block( + block_offset, (bufferlist.c_str() + + ((block_offset - offset) * m_model.get_block_size()))); + if (!valid_block) { invalid_block_offsets.push_back(block_offset); } } - if (!invalid_block_offsets.empty()) - { + if (!invalid_block_offsets.empty()) { printDebugInformationForOffsets(offset, invalid_block_offsets, bufferlist); } @@ -244,59 +227,51 @@ bool HeaderedSeededRandomGenerator::validate(bufferlist& bufferlist, } bool HeaderedSeededRandomGenerator::validate_block(uint64_t block_offset, - const char* buffer_start) -{ + const char* buffer_start) { // We validate the block matches what we generate byte for byte // however we ignore the time section of the header ceph::bufferptr bufferptr = generate_block(block_offset); bool valid = strncmp(bufferptr.c_str(), buffer_start, timeStart()) == 0; - valid = valid ? strncmp(bufferptr.c_str() + timeEnd(), - buffer_start + timeEnd(), - m_model.get_block_size() - timeEnd()) == 0 : valid; + valid = valid + ? 
strncmp(bufferptr.c_str() + timeEnd(), buffer_start + timeEnd(), + m_model.get_block_size() - timeEnd()) == 0 + : valid; return valid; } const HeaderedSeededRandomGenerator::ErrorType - HeaderedSeededRandomGenerator::getErrorTypeForBlock(uint64_t read_offset, - uint64_t block_offset, - const bufferlist& bufferlist) -{ - try - { - UniqueIdBytes read_unique_run_id = readUniqueRunId(block_offset - read_offset, - bufferlist); - if (unique_run_id != read_unique_run_id) - { +HeaderedSeededRandomGenerator::getErrorTypeForBlock( + uint64_t read_offset, uint64_t block_offset, const bufferlist& bufferlist) { + try { + UniqueIdBytes read_unique_run_id = + readUniqueRunId(block_offset - read_offset, bufferlist); + if (unique_run_id != read_unique_run_id) { return ErrorType::RUN_ID_MISMATCH; } SeedBytes read_seed = readSeed(block_offset - read_offset, bufferlist); - if (m_model.get_seed(block_offset) != read_seed) - { + if (m_model.get_seed(block_offset) != read_seed) { return ErrorType::SEED_MISMATCH; } if (std::strncmp(&bufferlist[((block_offset - read_offset) * - m_model.get_block_size()) + bodyStart()], + m_model.get_block_size()) + + bodyStart()], generate_block(block_offset).c_str() + bodyStart(), - m_model.get_block_size() - bodyStart()) != 0) - { + m_model.get_block_size() - bodyStart()) != 0) { return ErrorType::DATA_MISMATCH; } - } - catch(const std::exception& e) - { + } catch (const std::exception& e) { return ErrorType::DATA_NOT_FOUND; } return ErrorType::UNKNOWN; } -void HeaderedSeededRandomGenerator - ::printDebugInformationForBlock(uint64_t read_offset, uint64_t block_offset, - const bufferlist& bufferlist) -{ - ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset, bufferlist); +void HeaderedSeededRandomGenerator ::printDebugInformationForBlock( + uint64_t read_offset, uint64_t block_offset, const bufferlist& bufferlist) { + ErrorType blockError = + getErrorTypeForBlock(read_offset, block_offset, bufferlist); TimeBytes read_time = 0; std::time_t 
ttp; @@ -304,433 +279,361 @@ void HeaderedSeededRandomGenerator char read_bytes[m_model.get_block_size()]; char generated_bytes[m_model.get_block_size()]; - if (blockError == ErrorType::DATA_MISMATCH || blockError == ErrorType::UNKNOWN) - { + if (blockError == ErrorType::DATA_MISMATCH || + blockError == ErrorType::UNKNOWN) { read_time = readDateTime(block_offset - read_offset, bufferlist); - std::chrono::system_clock::time_point time_point{std::chrono::milliseconds{read_time}}; + std::chrono::system_clock::time_point time_point{ + std::chrono::milliseconds{read_time}}; ttp = std::chrono::system_clock::to_time_t(time_point); - std::memcpy(&read_bytes, - &bufferlist[((block_offset - read_offset) * m_model.get_block_size())], - m_model.get_block_size() - bodyStart()); - std::memcpy(&generated_bytes, - generate_block(block_offset).c_str(), + std::memcpy( + &read_bytes, + &bufferlist[((block_offset - read_offset) * m_model.get_block_size())], + m_model.get_block_size() - bodyStart()); + std::memcpy(&generated_bytes, generate_block(block_offset).c_str(), m_model.get_block_size() - bodyStart()); } std::string error_string; - switch(blockError) - { - case ErrorType::RUN_ID_MISMATCH: - { - UniqueIdBytes read_unique_run_id = readUniqueRunId((block_offset - read_offset), - bufferlist); - error_string = fmt::format("Header (Run ID) mismatch detected at block {} " - "(byte offset {}) Header expected run id {} but found id {}. " - "Block data corrupt or not written from this instance of this application.", - block_offset, - block_offset * m_model.get_block_size(), - unique_run_id, - read_unique_run_id); - } - break; - - case ErrorType::SEED_MISMATCH: - { + switch (blockError) { + case ErrorType::RUN_ID_MISMATCH: { + UniqueIdBytes read_unique_run_id = + readUniqueRunId((block_offset - read_offset), bufferlist); + error_string = fmt::format( + "Header (Run ID) mismatch detected at block {} " + "(byte offset {}) Header expected run id {} but found id {}. 
" + "Block data corrupt or not written from this instance of this " + "application.", + block_offset, block_offset * m_model.get_block_size(), unique_run_id, + read_unique_run_id); + } break; + + case ErrorType::SEED_MISMATCH: { SeedBytes read_seed = readSeed((block_offset - read_offset), bufferlist); - if (m_model.get_seed_offsets(read_seed).size() == 0) - { - error_string = fmt::format("Data (Seed) mismatch detected at block {}" - " (byte offset {}). Header expected seed {} but found seed {}. " - "Read data was not from any other recognised block in the object.", - block_offset, - block_offset * m_model.get_block_size(), - m_model.get_seed(block_offset), - read_seed); - } - else - { + if (m_model.get_seed_offsets(read_seed).size() == 0) { + error_string = fmt::format( + "Data (Seed) mismatch detected at block {}" + " (byte offset {}). Header expected seed {} but found seed {}. " + "Read data was not from any other recognised block in the object.", + block_offset, block_offset * m_model.get_block_size(), + m_model.get_seed(block_offset), read_seed); + } else { std::vector<int> seed_offsets = m_model.get_seed_offsets(read_seed); - error_string = fmt::format("Data (Seed) mismatch detected at block {}" - " (byte offset {}). Header expected seed {} but found seed {}." - " Read data was from a different block(s): {}", - block_offset, - block_offset * m_model.get_block_size(), - m_model.get_seed(block_offset), - read_seed, + error_string = fmt::format( + "Data (Seed) mismatch detected at block {}" + " (byte offset {}). Header expected seed {} but found seed {}." + " Read data was from a different block(s): {}", + block_offset, block_offset * m_model.get_block_size(), + m_model.get_seed(block_offset), read_seed, fmt::join(seed_offsets.begin(), seed_offsets.end(), "")); } - } - break; - - case ErrorType::DATA_MISMATCH: - { - error_string = fmt::format("Data (Body) mismatch detected at block {}" - " (byte offset {}). Header data matches, data body does not." 
- " Data written at {}\nExpected data: \n{:02x}\nRead data:{:02x}", - block_offset, - block_offset * m_model.get_block_size(), + } break; + + case ErrorType::DATA_MISMATCH: { + error_string = fmt::format( + "Data (Body) mismatch detected at block {}" + " (byte offset {}). Header data matches, data body does not." + " Data written at {}\nExpected data: \n{:02x}\nRead data:{:02x}", + block_offset, block_offset * m_model.get_block_size(), std::ctime(&ttp), - fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""), + fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), + ""), fmt::join(read_bytes, read_bytes + m_model.get_block_size(), "")); - } - break; + } break; - case ErrorType::DATA_NOT_FOUND: - { + case ErrorType::DATA_NOT_FOUND: { uint64_t bufferlist_length = bufferlist.to_str().size(); - error_string = fmt::format("Data (Body) could not be read at block {}" - " (byte offset {}) offset in bufferlist returned from read: {}" - " ({} bytes). Returned bufferlist length: {}.", - block_offset, - block_offset * m_model.get_block_size(), + error_string = fmt::format( + "Data (Body) could not be read at block {}" + " (byte offset {}) offset in bufferlist returned from read: {}" + " ({} bytes). 
Returned bufferlist length: {}.", + block_offset, block_offset * m_model.get_block_size(), (block_offset - read_offset), (block_offset - read_offset) * m_model.get_block_size(), bufferlist_length); - } - break; + } break; case ErrorType::UNKNOWN: - [[ fallthrough ]]; - - default: - { - error_string = fmt::format("Data mismatch detected at block {}" - " (byte offset {}).\nExpected data:\n{:02x}\nRead data:\n{:02x}", - block_offset, - block_offset * m_model.get_block_size(), - fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""), + [[fallthrough]]; + + default: { + error_string = fmt::format( + "Data mismatch detected at block {}" + " (byte offset {}).\nExpected data:\n{:02x}\nRead data:\n{:02x}", + block_offset, block_offset * m_model.get_block_size(), + fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), + ""), fmt::join(read_bytes, read_bytes + m_model.get_block_size(), "")); - } - break; + } break; } dout(0) << error_string << dendl; } -void HeaderedSeededRandomGenerator - ::printDebugInformationForRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - ErrorType rangeError, - const bufferlist& bufferlist) -{ - switch(rangeError) - { - case ErrorType::RUN_ID_MISMATCH: - printDebugInformationForRunIdMismatchRange(read_offset, start_block_offset, - range_length_in_blocks, bufferlist); - break; - case ErrorType::SEED_MISMATCH: - printDebugInformationForSeedMismatchRange(read_offset, start_block_offset, - range_length_in_blocks, bufferlist); - break; - case ErrorType::DATA_MISMATCH: - printDebugInformationDataBodyMismatchRange(read_offset, start_block_offset, - range_length_in_blocks, bufferlist); - break; - case ErrorType::DATA_NOT_FOUND: - printDebugInformationDataNotFoundRange(read_offset, start_block_offset, - range_length_in_blocks, bufferlist); - break; - case ErrorType::UNKNOWN: - [[ fallthrough ]]; - default: - printDebugInformationCorruptRange(read_offset, start_block_offset, - 
range_length_in_blocks, bufferlist); - break; +void HeaderedSeededRandomGenerator ::printDebugInformationForRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, ErrorType rangeError, + const bufferlist& bufferlist) { + switch (rangeError) { + case ErrorType::RUN_ID_MISMATCH: + printDebugInformationForRunIdMismatchRange( + read_offset, start_block_offset, range_length_in_blocks, bufferlist); + break; + case ErrorType::SEED_MISMATCH: + printDebugInformationForSeedMismatchRange( + read_offset, start_block_offset, range_length_in_blocks, bufferlist); + break; + case ErrorType::DATA_MISMATCH: + printDebugInformationDataBodyMismatchRange( + read_offset, start_block_offset, range_length_in_blocks, bufferlist); + break; + case ErrorType::DATA_NOT_FOUND: + printDebugInformationDataNotFoundRange( + read_offset, start_block_offset, range_length_in_blocks, bufferlist); + break; + case ErrorType::UNKNOWN: + [[fallthrough]]; + default: + printDebugInformationCorruptRange(read_offset, start_block_offset, + range_length_in_blocks, bufferlist); + break; } } -void HeaderedSeededRandomGenerator - ::printDebugInformationForRunIdMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ +void HeaderedSeededRandomGenerator ::printDebugInformationForRunIdMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { uint64_t range_start = start_block_offset; uint64_t range_length = 0; - UniqueIdBytes initial_read_unique_run_id = readUniqueRunId(start_block_offset - read_offset, - bufferlist); + UniqueIdBytes initial_read_unique_run_id = + readUniqueRunId(start_block_offset - read_offset, bufferlist); for (uint64_t i = start_block_offset; - i < start_block_offset + range_length_in_blocks; i++) - { - ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) - == ErrorType::RUN_ID_MISMATCH); + i < 
start_block_offset + range_length_in_blocks; i++) { + ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) == + ErrorType::RUN_ID_MISMATCH); - UniqueIdBytes read_unique_run_id = readUniqueRunId(i - read_offset, bufferlist); + UniqueIdBytes read_unique_run_id = + readUniqueRunId(i - read_offset, bufferlist); if (initial_read_unique_run_id != read_unique_run_id || - i == (start_block_offset + range_length_in_blocks - 1)) - { - if (range_length == 1) - { + i == (start_block_offset + range_length_in_blocks - 1)) { + if (range_length == 1) { printDebugInformationForBlock(read_offset, i, bufferlist); - } - else if (range_length > 1) - { - dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {} ({} bytes)" - " and spanning a range of {} blocks ({} bytes). " - "Expected run id {} for range but found id {}" - " for all blocks in range. " - "Block data corrupt or not written from this instance of this application.", - range_start, - range_start * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size(), - unique_run_id, - initial_read_unique_run_id) << dendl; + } else if (range_length > 1) { + dout(0) + << fmt::format( + "Data (Run ID) Mismatch detected from block {} ({} bytes)" + " and spanning a range of {} blocks ({} bytes). " + "Expected run id {} for range but found id {}" + " for all blocks in range. 
" + "Block data corrupt or not written from this instance of " + "this application.", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + unique_run_id, initial_read_unique_run_id) + << dendl; } range_start = i; range_length = 1; initial_read_unique_run_id = read_unique_run_id; - } - else - { + } else { range_length++; } } - if (range_length == 1) - { - printDebugInformationForBlock(read_offset, - start_block_offset + range_length_in_blocks - 1, - bufferlist); - } - else if (range_length > 1) - { - dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {}" - " ({} bytes) and spanning a range of {} blocks ({} bytes). " - "Expected run id {} for range but found id for all blocks in range. " - "Block data corrupt or not written from this instance of this application.", - range_start, - range_start * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size(), - unique_run_id, - initial_read_unique_run_id) + if (range_length == 1) { + printDebugInformationForBlock( + read_offset, start_block_offset + range_length_in_blocks - 1, + bufferlist); + } else if (range_length > 1) { + dout(0) << fmt::format( + "Data (Run ID) Mismatch detected from block {}" + " ({} bytes) and spanning a range of {} blocks ({} bytes). " + "Expected run id {} for range but found id for all blocks " + "in range. 
" + "Block data corrupt or not written from this instance of " + "this application.", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + unique_run_id, initial_read_unique_run_id) << dendl; } } -void HeaderedSeededRandomGenerator - ::printDebugInformationForSeedMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ +void HeaderedSeededRandomGenerator ::printDebugInformationForSeedMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { uint64_t range_start = start_block_offset; uint64_t range_length = 0; // Assert here if needed, as we can't support values // that can't be converted to a signed integer. - ceph_assert(m_model.get_block_size() < (std::numeric_limits<uint64_t>::max() / 2)); + ceph_assert(m_model.get_block_size() < + (std::numeric_limits<uint64_t>::max() / 2)); std::optional<int64_t> range_offset = 0; for (uint64_t i = start_block_offset; - i < start_block_offset + range_length_in_blocks; i++) - { - ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) - == ErrorType::SEED_MISMATCH); + i < start_block_offset + range_length_in_blocks; i++) { + ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) == + ErrorType::SEED_MISMATCH); SeedBytes read_seed = readSeed(i - read_offset, bufferlist); std::vector<int> seed_found_offsets = m_model.get_seed_offsets(read_seed); if ((seed_found_offsets.size() == 1 && - (static_cast<int64_t>(seed_found_offsets.front() - i) == range_offset)) || - range_length == 0) - { - if (range_length == 0) - { + (static_cast<int64_t>(seed_found_offsets.front() - i) == + range_offset)) || + range_length == 0) { + if (range_length == 0) { range_start = i; - if (seed_found_offsets.size() > 0) - { + if (seed_found_offsets.size() > 0) { range_offset = seed_found_offsets.front() - i; - } - else - { + } else { 
range_offset = std::nullopt; } } range_length++; - } - else - { - if (range_length == 1) - { + } else { + if (range_length == 1) { printDebugInformationForBlock(read_offset, i - 1, bufferlist); - } - else if (range_length > 1 && range_offset.has_value()) - { - dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}" - " ({} bytes) and spanning a range of {} blocks ({} bytes). " - "Returned data located starting from block {} ({} bytes) " - "and spanning a range of {} blocks ({} bytes).", - range_start, - range_start * m_model.get_block_size(), - range_length, range_length * m_model.get_block_size(), - static_cast<uint64_t>(*range_offset) + range_start, - (static_cast<uint64_t>(*range_offset) + range_start) - * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size()) - << dendl; - } - else - { - dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}" - " ({} bytes) and spanning a range of {} blocks ({} bytes). " - "Data seed mismatch spanning a range of {} blocks ({} bytes).", - range_start, - range_start * m_model.get_block_size(), - range_length, range_length * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size()) - << dendl; + } else if (range_length > 1 && range_offset.has_value()) { + dout(0) + << fmt::format( + "Data (Seed) Mismatch detected from block {}" + " ({} bytes) and spanning a range of {} blocks ({} bytes). 
" + "Returned data located starting from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes).", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + static_cast<uint64_t>(*range_offset) + range_start, + (static_cast<uint64_t>(*range_offset) + range_start) * + m_model.get_block_size(), + range_length, range_length * m_model.get_block_size()) + << dendl; + } else { + dout(0) + << fmt::format( + "Data (Seed) Mismatch detected from block {}" + " ({} bytes) and spanning a range of {} blocks ({} bytes). " + "Data seed mismatch spanning a range of {} blocks ({} " + "bytes).", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size()) + << dendl; } range_length = 1; range_start = i; - if (seed_found_offsets.size() > 0) - { + if (seed_found_offsets.size() > 0) { range_offset = seed_found_offsets.front() - i; - } - else - { + } else { range_offset = std::nullopt; } } } - if (range_length == 1) - { - printDebugInformationForBlock(read_offset, - start_block_offset + range_length_in_blocks - 1, - bufferlist); - } - else if (range_length > 1 && range_offset.has_value()) - { - dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) " - "and spanning a range of {} blocks ({} bytes). 
" - "Returned data located starting from block {} ({} bytes) " - "and spanning a range of {} blocks ({} bytes).", - range_start, - range_start * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size(), - *range_offset + range_start, - (*range_offset + range_start) * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size()) + if (range_length == 1) { + printDebugInformationForBlock( + read_offset, start_block_offset + range_length_in_blocks - 1, + bufferlist); + } else if (range_length > 1 && range_offset.has_value()) { + dout(0) << fmt::format( + "Data (Seed) Mismatch detected from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes). " + "Returned data located starting from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes).", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + *range_offset + range_start, + (*range_offset + range_start) * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size()) << dendl; - } - else - { - dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) " - "and spanning a range of {} blocks ({} bytes). " - "and spanning a range of {} blocks ({} bytes).", - range_start, - range_start * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size()) + } else { + dout(0) << fmt::format( + "Data (Seed) Mismatch detected from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes). 
" + "and spanning a range of {} blocks ({} bytes).", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size()) << dendl; } } -void HeaderedSeededRandomGenerator -::printDebugInformationDataBodyMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ - dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. " - "Headers look as expected for range, " - "but generated data body does not match. " - "More information given for individual blocks below.", - start_block_offset, - start_block_offset + range_length_in_blocks - 1) +void HeaderedSeededRandomGenerator ::printDebugInformationDataBodyMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { + dout(0) << fmt::format( + "Data Mismatch detected in blocks from {} to {}. " + "Headers look as expected for range, " + "but generated data body does not match. " + "More information given for individual blocks below.", + start_block_offset, + start_block_offset + range_length_in_blocks - 1) << dendl; for (uint64_t i = start_block_offset; - i < start_block_offset + range_length_in_blocks; i++) - { + i < start_block_offset + range_length_in_blocks; i++) { printDebugInformationForBlock(read_offset, i, bufferlist); } } -void HeaderedSeededRandomGenerator - ::printDebugInformationCorruptRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ - dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. " - "Headers look as expected for range, " - "but generated data body does not match. 
" - "More information given for individual blocks below.", - start_block_offset, - start_block_offset + range_length_in_blocks - 1) +void HeaderedSeededRandomGenerator ::printDebugInformationCorruptRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { + dout(0) << fmt::format( + "Data Mismatch detected in blocks from {} to {}. " + "Headers look as expected for range, " + "but generated data body does not match. " + "More information given for individual blocks below.", + start_block_offset, + start_block_offset + range_length_in_blocks - 1) << dendl; for (uint64_t i = start_block_offset; - i < start_block_offset + range_length_in_blocks; i++) - { + i < start_block_offset + range_length_in_blocks; i++) { printDebugInformationForBlock(read_offset, i, bufferlist); } } -void HeaderedSeededRandomGenerator - ::printDebugInformationDataNotFoundRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ - dout(0) << fmt::format("Data not found for blocks from {} to {}. " - "More information given for individual blocks below.", - start_block_offset, - start_block_offset + range_length_in_blocks - 1) +void HeaderedSeededRandomGenerator ::printDebugInformationDataNotFoundRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { + dout(0) << fmt::format( + "Data not found for blocks from {} to {}. 
" + "More information given for individual blocks below.", + start_block_offset, + start_block_offset + range_length_in_blocks - 1) << dendl; - for (uint64_t i = start_block_offset; i < start_block_offset + range_length_in_blocks; i++) - { + for (uint64_t i = start_block_offset; + i < start_block_offset + range_length_in_blocks; i++) { printDebugInformationForBlock(read_offset, i, bufferlist); } } -void HeaderedSeededRandomGenerator - ::printDebugInformationForOffsets(uint64_t read_offset, - std::vector<uint64_t> offsets, - const bufferlist& bufferlist) -{ +void HeaderedSeededRandomGenerator ::printDebugInformationForOffsets( + uint64_t read_offset, std::vector<uint64_t> offsets, + const bufferlist& bufferlist) { uint64_t range_start = 0; uint64_t range_length = 0; ErrorType rangeError = ErrorType::UNKNOWN; - for (const uint64_t& block_offset : offsets) - { - ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset, - bufferlist); + for (const uint64_t& block_offset : offsets) { + ErrorType blockError = + getErrorTypeForBlock(read_offset, block_offset, bufferlist); - if (range_start == 0 && range_length == 0) - { + if (range_start == 0 && range_length == 0) { range_start = block_offset; range_length = 1; rangeError = blockError; - } - else if (blockError == rangeError && - range_start + range_length == block_offset) -{ + } else if (blockError == rangeError && + range_start + range_length == block_offset) { range_length++; - } - else - { - if (range_length == 1) - { + } else { + if (range_length == 1) { printDebugInformationForBlock(read_offset, range_start, bufferlist); - } - else if (range_length > 1) - { + } else if (range_length > 1) { printDebugInformationForRange(read_offset, range_start, range_length, rangeError, bufferlist); } @@ -741,12 +644,9 @@ void HeaderedSeededRandomGenerator } } - if (range_length == 1) - { + if (range_length == 1) { printDebugInformationForBlock(read_offset, range_start, bufferlist); - } - else if (range_length > 1) - { 
+ } else if (range_length > 1) { printDebugInformationForRange(read_offset, range_start, range_length, rangeError, bufferlist); } diff --git a/src/common/io_exerciser/DataGenerator.h b/src/common/io_exerciser/DataGenerator.h index 1e5784a54cc..c497c78ed61 100644 --- a/src/common/io_exerciser/DataGenerator.h +++ b/src/common/io_exerciser/DataGenerator.h @@ -3,8 +3,8 @@ #include <memory> #include <random> -#include "include/buffer.h" #include "ObjectModel.h" +#include "include/buffer.h" /* Overview * @@ -23,149 +23,139 @@ * * class HeaderedSeededRandomGenerator * Inherits from SeededDataGenerator. Generates entirely random patterns - * based on the seed retrieved by the model, however also appends a + * based on the seed retrieved by the model, however also appends a * header to the start of each block. This generator also provides * a range of verbose debug options to help disagnose a miscompare * whenever it detects unexpected data. */ namespace ceph { - namespace io_exerciser { - namespace data_generation { - enum class GenerationType { - SeededRandom, - HeaderedSeededRandom - // CompressedGenerator - // MixedGenerator - }; - - class DataGenerator { - public: - virtual ~DataGenerator() = default; - static std::unique_ptr<DataGenerator> - create_generator(GenerationType generatorType, - const ObjectModel& model); - virtual bufferlist generate_data(uint64_t length, uint64_t offset)=0; - virtual bool validate(bufferlist& bufferlist, uint64_t offset, - uint64_t length); - - // Used for testing debug outputs from data generation - virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length); - - protected: - const ObjectModel& m_model; - - DataGenerator(const ObjectModel& model) : m_model(model) {} - }; - - class SeededRandomGenerator : public DataGenerator - { - public: - SeededRandomGenerator(const ObjectModel& model) - : DataGenerator(model) {} - - virtual bufferptr generate_block(uint64_t offset); - virtual bufferlist generate_data(uint64_t length, 
uint64_t offset); - virtual bufferptr generate_wrong_block(uint64_t offset); - virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length) override; - }; - - class HeaderedSeededRandomGenerator : public SeededRandomGenerator - { - public: - HeaderedSeededRandomGenerator(const ObjectModel& model, - std::optional<uint64_t> unique_run_id = std::nullopt); - - bufferptr generate_block(uint64_t offset) override; - bufferptr generate_wrong_block(uint64_t offset) override; - bool validate(bufferlist& bufferlist, uint64_t offset, - uint64_t length) override; - - private: - using UniqueIdBytes = uint64_t; - using SeedBytes = int; - using TimeBytes = uint64_t; - - enum class ErrorType { - RUN_ID_MISMATCH, - SEED_MISMATCH, - DATA_MISMATCH, - DATA_NOT_FOUND, - UNKNOWN - }; - - constexpr uint8_t headerStart() const - { return 0; }; - constexpr uint8_t uniqueIdStart() const - { return headerStart(); }; - constexpr uint8_t uniqueIdLength() const - { return sizeof(UniqueIdBytes); }; - constexpr uint8_t seedStart() const - { return uniqueIdStart() + uniqueIdLength(); }; - constexpr uint8_t seedLength() const - { return sizeof(SeedBytes); }; - constexpr uint8_t timeStart() const - { return seedStart() + seedLength(); }; - constexpr uint8_t timeLength() const - { return sizeof(TimeBytes); }; - constexpr uint8_t timeEnd() const - { return timeStart() + timeLength(); }; - constexpr uint8_t headerLength() const - { return uniqueIdLength() + seedLength() + timeLength(); }; - constexpr uint8_t bodyStart() const - { return headerStart() + headerLength(); }; - - const UniqueIdBytes readUniqueRunId(uint64_t block_offset, - const bufferlist& bufferlist); - const SeedBytes readSeed(uint64_t block_offset, - const bufferlist& bufferlist); - const TimeBytes readDateTime(uint64_t block_offset, +namespace io_exerciser { +namespace data_generation { +enum class GenerationType { + SeededRandom, + HeaderedSeededRandom + // CompressedGenerator + // MixedGenerator +}; + +class DataGenerator 
{ + public: + virtual ~DataGenerator() = default; + static std::unique_ptr<DataGenerator> create_generator( + GenerationType generatorType, const ObjectModel& model); + virtual bufferlist generate_data(uint64_t length, uint64_t offset) = 0; + virtual bool validate(bufferlist& bufferlist, uint64_t offset, + uint64_t length); + + // Used for testing debug outputs from data generation + virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length); + + protected: + const ObjectModel& m_model; + + DataGenerator(const ObjectModel& model) : m_model(model) {} +}; + +class SeededRandomGenerator : public DataGenerator { + public: + SeededRandomGenerator(const ObjectModel& model) : DataGenerator(model) {} + + virtual bufferptr generate_block(uint64_t offset); + bufferlist generate_data(uint64_t length, uint64_t offset) override; + virtual bufferptr generate_wrong_block(uint64_t offset); + bufferlist generate_wrong_data(uint64_t offset, + uint64_t length) override; +}; + +class HeaderedSeededRandomGenerator : public SeededRandomGenerator { + public: + HeaderedSeededRandomGenerator( + const ObjectModel& model, + std::optional<uint64_t> unique_run_id = std::nullopt); + + bufferptr generate_block(uint64_t offset) override; + bufferptr generate_wrong_block(uint64_t offset) override; + bool validate(bufferlist& bufferlist, uint64_t offset, + uint64_t length) override; + + private: + using UniqueIdBytes = uint64_t; + using SeedBytes = int; + using TimeBytes = uint64_t; + + enum class ErrorType { + RUN_ID_MISMATCH, + SEED_MISMATCH, + DATA_MISMATCH, + DATA_NOT_FOUND, + UNKNOWN + }; + + constexpr uint8_t headerStart() const { return 0; }; + constexpr uint8_t uniqueIdStart() const { return headerStart(); }; + constexpr uint8_t uniqueIdLength() const { return sizeof(UniqueIdBytes); }; + constexpr uint8_t seedStart() const { + return uniqueIdStart() + uniqueIdLength(); + }; + constexpr uint8_t seedLength() const { return sizeof(SeedBytes); }; + constexpr uint8_t timeStart() 
const { return seedStart() + seedLength(); }; + constexpr uint8_t timeLength() const { return sizeof(TimeBytes); }; + constexpr uint8_t timeEnd() const { return timeStart() + timeLength(); }; + constexpr uint8_t headerLength() const { + return uniqueIdLength() + seedLength() + timeLength(); + }; + constexpr uint8_t bodyStart() const { + return headerStart() + headerLength(); + }; + + const UniqueIdBytes readUniqueRunId(uint64_t block_offset, + const bufferlist& bufferlist); + const SeedBytes readSeed(uint64_t block_offset, const bufferlist& bufferlist); + const TimeBytes readDateTime(uint64_t block_offset, + const bufferlist& bufferlist); + + const UniqueIdBytes unique_run_id; + + uint64_t generate_unique_run_id(); + + bool validate_block(uint64_t block_offset, const char* buffer_start); + + const ErrorType getErrorTypeForBlock(uint64_t read_offset, + uint64_t block_offset, const bufferlist& bufferlist); - const UniqueIdBytes unique_run_id; - - uint64_t generate_unique_run_id(); - - bool validate_block(uint64_t block_offset, const char* buffer_start); - - const ErrorType getErrorTypeForBlock(uint64_t read_offset, - uint64_t block_offset, - const bufferlist& bufferlist); - - void printDebugInformationForBlock(uint64_t read_offset, - uint64_t block_offset, - const bufferlist& bufferlist); - void printDebugInformationForRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - ErrorType rangeError, - const bufferlist& bufferlist); - - void printDebugInformationForRunIdMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist); - void printDebugInformationForSeedMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist); - void printDebugInformationDataBodyMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& 
bufferlist); - void printDebugInformationDataNotFoundRange(uint64_t ßread_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist); - void printDebugInformationCorruptRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist); - - void printDebugInformationForOffsets(uint64_t read_offset, - std::vector<uint64_t> offsets, - const bufferlist& bufferlist); - }; - } - } -} + void printDebugInformationForBlock(uint64_t read_offset, + uint64_t block_offset, + const bufferlist& bufferlist); + void printDebugInformationForRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + ErrorType rangeError, + const bufferlist& bufferlist); + + void printDebugInformationForRunIdMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist); + void printDebugInformationForSeedMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist); + void printDebugInformationDataBodyMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist); + void printDebugInformationDataNotFoundRange(uint64_t ßread_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + const bufferlist& bufferlist); + void printDebugInformationCorruptRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + const bufferlist& bufferlist); + + void printDebugInformationForOffsets(uint64_t read_offset, + std::vector<uint64_t> offsets, + const bufferlist& bufferlist); +}; +} // namespace data_generation +} // namespace io_exerciser +} // namespace ceph diff --git a/src/common/io_exerciser/EcIoSequence.cc b/src/common/io_exerciser/EcIoSequence.cc new file mode 100644 index 00000000000..611920c96e0 --- 
/dev/null +++ b/src/common/io_exerciser/EcIoSequence.cc @@ -0,0 +1,267 @@ +#include "EcIoSequence.h" + +#include <memory> + +using IoOp = ceph::io_exerciser::IoOp; +using Sequence = ceph::io_exerciser::Sequence; +using IoSequence = ceph::io_exerciser::IoSequence; +using EcIoSequence = ceph::io_exerciser::EcIoSequence; +using ReadInjectSequence = ceph::io_exerciser::ReadInjectSequence; + +bool EcIoSequence::is_supported(Sequence sequence) const { return true; } + +std::unique_ptr<IoSequence> EcIoSequence::generate_sequence( + Sequence sequence, std::pair<int, int> obj_size_range, int k, int m, + int seed) { + switch (sequence) { + case Sequence::SEQUENCE_SEQ0: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ1: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ2: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ3: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ4: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ5: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ6: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ7: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ8: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ9: + return std::make_unique<ReadInjectSequence>(obj_size_range, seed, + sequence, k, m); + case Sequence::SEQUENCE_SEQ10: + return std::make_unique<Seq10>(obj_size_range, seed, k, m); + default: + ceph_abort_msg("Unrecognised sequence"); + } +} + +EcIoSequence::EcIoSequence(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), + setup_inject(false), + clear_inject(false), + shard_to_inject(std::nullopt) {} + +void EcIoSequence::select_random_data_shard_to_inject_read_error(int k, int m) { + shard_to_inject = rng(k - 1); + setup_inject = true; +} + +void EcIoSequence::select_random_data_shard_to_inject_write_error(int k, + int m) { + // Write errors do not support injecting to the primary OSD + shard_to_inject = rng(1, k - 1); + setup_inject = true; +} + +void EcIoSequence::select_random_shard_to_inject_read_error(int k, int m) { + 
shard_to_inject = rng(k + m - 1); + setup_inject = true; +} + +void EcIoSequence::select_random_shard_to_inject_write_error(int k, int m) { + // Write errors do not support injecting to the primary OSD + shard_to_inject = rng(1, k + m - 1); + setup_inject = true; +} + +void EcIoSequence::generate_random_read_inject_type() { + inject_op_type = static_cast<InjectOpType>( + rng(static_cast<int>(InjectOpType::ReadEIO), + static_cast<int>(InjectOpType::ReadMissingShard))); +} + +void EcIoSequence::generate_random_write_inject_type() { + inject_op_type = static_cast<InjectOpType>( + rng(static_cast<int>(InjectOpType::WriteFailAndRollback), + static_cast<int>(InjectOpType::WriteOSDAbort))); +} + +ceph::io_exerciser::ReadInjectSequence::ReadInjectSequence( + std::pair<int, int> obj_size_range, int seed, Sequence s, int k, int m) + : EcIoSequence(obj_size_range, seed) { + child_sequence = IoSequence::generate_sequence(s, obj_size_range, seed); + select_random_data_shard_to_inject_read_error(k, m); + generate_random_read_inject_type(); +} + +Sequence ceph::io_exerciser::ReadInjectSequence::get_id() const { + return child_sequence->get_id(); +} + +std::string ceph::io_exerciser::ReadInjectSequence::get_name() const { + return child_sequence->get_name() + + " running with read errors injected on shard " + + std::to_string(*shard_to_inject); +} + +std::unique_ptr<IoOp> ReadInjectSequence::next() { + step++; + + if (nextOp) { + std::unique_ptr<IoOp> retOp = nullptr; + nextOp.swap(retOp); + return retOp; + } + + std::unique_ptr<IoOp> childOp = child_sequence->next(); + + switch (childOp->getOpType()) { + case OpType::Remove: + nextOp.swap(childOp); + switch (inject_op_type) { + case InjectOpType::ReadEIO: + return ClearReadErrorInjectOp::generate(*shard_to_inject, 0); + case InjectOpType::ReadMissingShard: + return ClearReadErrorInjectOp::generate(*shard_to_inject, 1); + case InjectOpType::WriteFailAndRollback: + return ClearWriteErrorInjectOp::generate(*shard_to_inject, 0); + 
case InjectOpType::WriteOSDAbort: + return ClearWriteErrorInjectOp::generate(*shard_to_inject, 3); + case InjectOpType::None: + [[fallthrough]]; + default: + ceph_abort_msg("Unsupported operation"); + } + break; + case OpType::Create: + switch (inject_op_type) { + case InjectOpType::ReadEIO: + nextOp = InjectReadErrorOp::generate( + *shard_to_inject, 0, 0, std::numeric_limits<uint64_t>::max()); + break; + case InjectOpType::ReadMissingShard: + nextOp = InjectReadErrorOp::generate( + *shard_to_inject, 1, 0, std::numeric_limits<uint64_t>::max()); + break; + case InjectOpType::WriteFailAndRollback: + nextOp = InjectWriteErrorOp::generate( + *shard_to_inject, 0, 0, std::numeric_limits<uint64_t>::max()); + break; + case InjectOpType::WriteOSDAbort: + nextOp = InjectWriteErrorOp::generate( + *shard_to_inject, 3, 0, std::numeric_limits<uint64_t>::max()); + break; + case InjectOpType::None: + [[fallthrough]]; + default: + ceph_abort_msg("Unsupported operation"); + } + break; + default: + // Do nothing in default case + break; + } + + return childOp; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> +ceph::io_exerciser::ReadInjectSequence::_next() { + ceph_abort_msg( + "Should not reach this point, " + "this sequence should only consume complete sequences"); + + return DoneOp::generate(); +} + +ceph::io_exerciser::Seq10::Seq10(std::pair<int, int> obj_size_range, int seed, + int k, int m) + : EcIoSequence(obj_size_range, seed), + offset(0), + length(1), + inject_error_done(false), + failed_write_done(false), + read_done(false), + successful_write_done(false), + test_all_lengths(false), // Only test length(1) due to time constraints + test_all_sizes( + false) // Only test obj_size(rand()) due to time constraints +{ + select_random_shard_to_inject_write_error(k, m); + // We will inject specifically as part of our sequence in this sequence + setup_inject = false; + if (!test_all_sizes) { + select_random_object_size(); + } +} + +Sequence ceph::io_exerciser::Seq10::get_id() const { 
+ return Sequence::SEQUENCE_SEQ10; +} + +std::string ceph::io_exerciser::Seq10::get_name() const { + return "Sequential writes of length " + std::to_string(length) + + " with queue depth 1" + " first injecting a failed write and read it to ensure it rolls back, " + "then" + " successfully writing the data and reading the write the ensure it " + "is applied"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq10::_next() { + if (!inject_error_done) { + inject_error_done = true; + return InjectWriteErrorOp::generate(*shard_to_inject, 0, 0, + std::numeric_limits<uint64_t>::max()); + } else if (!failed_write_done) { + failed_write_done = true; + read_done = false; + barrier = true; + return SingleFailedWriteOp::generate(offset, length); + } else if (failed_write_done && !read_done) { + read_done = true; + barrier = true; + return SingleReadOp::generate(offset, length); + } else if (!clear_inject_done) { + clear_inject_done = true; + return ClearWriteErrorInjectOp::generate(*shard_to_inject, 0); + } else if (!successful_write_done) { + successful_write_done = true; + read_done = false; + barrier = true; + return SingleWriteOp::generate(offset, length); + } else if (successful_write_done && !read_done) { + read_done = true; + return SingleReadOp::generate(offset, length); + } else if (successful_write_done && read_done) { + offset++; + inject_error_done = false; + failed_write_done = false; + read_done = false; + clear_inject_done = false; + successful_write_done = false; + + if (offset + length >= obj_size) { + if (!test_all_lengths) { + remove = true; + done = true; + return BarrierOp::generate(); + } + + offset = 0; + length++; + if (length > obj_size) { + if (!test_all_sizes) { + remove = true; + done = true; + return BarrierOp::generate(); + } + + length = 1; + return increment_object_size(); + } + } + + return BarrierOp::generate(); + } else { + ceph_abort_msg("Sequence in undefined state. Aborting"); + return DoneOp::generate(); + } +}
\ No newline at end of file diff --git a/src/common/io_exerciser/EcIoSequence.h b/src/common/io_exerciser/EcIoSequence.h new file mode 100644 index 00000000000..37283b3906b --- /dev/null +++ b/src/common/io_exerciser/EcIoSequence.h @@ -0,0 +1,65 @@ +#include "IoSequence.h" + +namespace ceph { +namespace io_exerciser { +class EcIoSequence : public IoSequence { + public: + virtual bool is_supported(Sequence sequence) const override; + static std::unique_ptr<IoSequence> generate_sequence( + Sequence s, std::pair<int, int> obj_size_range, int k, int m, int seed); + + protected: + bool setup_inject; + bool clear_inject; + std::optional<uint64_t> shard_to_inject; + InjectOpType inject_op_type; + + EcIoSequence(std::pair<int, int> obj_size_range, int seed); + + // Writes cannot be sent to injected on shard zero, so selections seperated + // out + void select_random_data_shard_to_inject_read_error(int k, int m); + void select_random_data_shard_to_inject_write_error(int k, int m); + void select_random_shard_to_inject_read_error(int k, int m); + void select_random_shard_to_inject_write_error(int k, int m); + void generate_random_read_inject_type(); + void generate_random_write_inject_type(); +}; + +class ReadInjectSequence : public EcIoSequence { + public: + ReadInjectSequence(std::pair<int, int> obj_size_range, int seed, Sequence s, + int k, int m); + + Sequence get_id() const override; + std::string get_name() const override; + virtual std::unique_ptr<IoOp> next() override; + std::unique_ptr<IoOp> _next() override; + + private: + std::unique_ptr<IoSequence> child_sequence; + std::unique_ptr<IoOp> nextOp; +}; + +class Seq10 : public EcIoSequence { + public: + Seq10(std::pair<int, int> obj_size_range, int seed, int k, int m); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + + bool inject_error_done; + bool failed_write_done; + bool read_done; + bool 
clear_inject_done; + bool successful_write_done; + bool test_all_lengths; + bool test_all_sizes; +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/IoOp.cc b/src/common/io_exerciser/IoOp.cc index cd855ba6fff..493d1f435b4 100644 --- a/src/common/io_exerciser/IoOp.cc +++ b/src/common/io_exerciser/IoOp.cc @@ -1,188 +1,316 @@ #include "IoOp.h" -using IoOp = ceph::io_exerciser::IoOp; +#include "fmt/format.h" +#include "include/ceph_assert.h" -IoOp::IoOp( OpType op, - uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2, - uint64_t offset3, uint64_t length3) : - op(op), - offset1(offset1), length1(length1), - offset2(offset2), length2(length2), - offset3(offset3), length3(length3) -{ +using IoOp = ceph::io_exerciser::IoOp; +using OpType = ceph::io_exerciser::OpType; -} +using DoneOp = ceph::io_exerciser::DoneOp; +using BarrierOp = ceph::io_exerciser::BarrierOp; +using CreateOp = ceph::io_exerciser::CreateOp; +using RemoveOp = ceph::io_exerciser::RemoveOp; +using SingleReadOp = ceph::io_exerciser::SingleReadOp; +using DoubleReadOp = ceph::io_exerciser::DoubleReadOp; +using TripleReadOp = ceph::io_exerciser::TripleReadOp; +using SingleWriteOp = ceph::io_exerciser::SingleWriteOp; +using DoubleWriteOp = ceph::io_exerciser::DoubleWriteOp; +using TripleWriteOp = ceph::io_exerciser::TripleWriteOp; +using SingleFailedWriteOp = ceph::io_exerciser::SingleFailedWriteOp; +using DoubleFailedWriteOp = ceph::io_exerciser::DoubleFailedWriteOp; +using TripleFailedWriteOp = ceph::io_exerciser::TripleFailedWriteOp; -std::string IoOp::value_to_string(uint64_t v) const -{ +namespace { +std::string value_to_string(uint64_t v) { if (v < 1024 || (v % 1024) != 0) { return std::to_string(v); - }else if (v < 1024*1024 || (v % (1024 * 1024)) != 0 ) { + } else if (v < 1024 * 1024 || (v % (1024 * 1024)) != 0) { return std::to_string(v / 1024) + "K"; - }else{ + } else { return std::to_string(v / 1024 / 1024) + "M"; } } +} // namespace -std::unique_ptr<IoOp> IoOp - ::generate_done() { +IoOp::IoOp() {} - return std::make_unique<IoOp>(OpType::Done); -} +template 
<OpType opType> +ceph::io_exerciser::TestOp<opType>::TestOp() : IoOp() {} + +DoneOp::DoneOp() : TestOp<OpType::Done>() {} -std::unique_ptr<IoOp> IoOp - ::generate_barrier() { +std::string DoneOp::to_string(uint64_t block_size) const { return "Done"; } - return std::make_unique<IoOp>(OpType::BARRIER); +std::unique_ptr<DoneOp> DoneOp::generate() { + return std::make_unique<DoneOp>(); } -std::unique_ptr<IoOp> IoOp - ::generate_create(uint64_t size) { +BarrierOp::BarrierOp() : TestOp<OpType::Barrier>() {} - return std::make_unique<IoOp>(OpType::CREATE,0,size); +std::unique_ptr<BarrierOp> BarrierOp::generate() { + return std::make_unique<BarrierOp>(); } -std::unique_ptr<IoOp> IoOp - ::generate_remove() { - - return std::make_unique<IoOp>(OpType::REMOVE); +std::string BarrierOp::to_string(uint64_t block_size) const { + return "Barrier"; } -std::unique_ptr<IoOp> IoOp - ::generate_read(uint64_t offset, uint64_t length) { +CreateOp::CreateOp(uint64_t size) : TestOp<OpType::Create>(), size(size) {} - return std::make_unique<IoOp>(OpType::READ, offset, length); +std::unique_ptr<CreateOp> CreateOp::generate(uint64_t size) { + return std::make_unique<CreateOp>(size); } -std::unique_ptr<IoOp> IoOp - ::generate_read2(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2) { +std::string CreateOp::to_string(uint64_t block_size) const { + return "Create (size=" + value_to_string(size * block_size) + ")"; +} - if (offset1 < offset2) { - ceph_assert( offset1 + length1 <= offset2 ); - } else { - ceph_assert( offset2 + length2 <= offset1 ); - } +RemoveOp::RemoveOp() : TestOp<OpType::Remove>() {} - return std::make_unique<IoOp>(OpType::READ2, - offset1, length1, - offset2, length2); +std::unique_ptr<RemoveOp> RemoveOp::generate() { + return std::make_unique<RemoveOp>(); } -std::unique_ptr<IoOp> IoOp - ::generate_read3(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2, - uint64_t offset3, uint64_t length3) { +std::string 
RemoveOp::to_string(uint64_t block_size) const { return "Remove"; } - if (offset1 < offset2) { - ceph_assert( offset1 + length1 <= offset2 ); - } else { - ceph_assert( offset2 + length2 <= offset1 ); +template <OpType opType, int numIOs> +ceph::io_exerciser::ReadWriteOp<opType, numIOs>::ReadWriteOp( + std::array<uint64_t, numIOs>&& offset, + std::array<uint64_t, numIOs>&& length) + : TestOp<opType>(), offset(offset), length(length) { + auto compare = [](uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2) { + if (offset1 < offset2) { + ceph_assert(offset1 + length1 <= offset2); + } else { + ceph_assert(offset2 + length2 <= offset1); + } + }; + + if (numIOs > 1) { + for (int i = 0; i < numIOs - 1; i++) { + for (int j = i + 1; j < numIOs; j++) { + compare(offset[i], length[i], offset[j], length[j]); + } + } } - if (offset1 < offset3) { - ceph_assert( offset1 + length1 <= offset3 ); - } else { - ceph_assert( offset3 + length3 <= offset1 ); +} + +template <OpType opType, int numIOs> +std::string ceph::io_exerciser::ReadWriteOp<opType, numIOs>::to_string( + uint64_t block_size) const { + std::string offset_length_desc; + if (numIOs > 0) { + offset_length_desc += fmt::format( + "offset1={}", value_to_string(this->offset[0] * block_size)); + offset_length_desc += fmt::format( + ",length1={}", value_to_string(this->length[0] * block_size)); + for (int i = 1; i < numIOs; i++) { + offset_length_desc += fmt::format( + ",offset{}={}", i + 1, value_to_string(this->offset[i] * block_size)); + offset_length_desc += fmt::format( + ",length{}={}", i + 1, value_to_string(this->length[i] * block_size)); + } } - if (offset2 < offset3) { - ceph_assert( offset2 + length2 <= offset3 ); - } else { - ceph_assert( offset3 + length3 <= offset2 ); + switch (opType) { + case OpType::Read: + [[fallthrough]]; + case OpType::Read2: + [[fallthrough]]; + case OpType::Read3: + return fmt::format("Read{} ({})", numIOs, offset_length_desc); + case OpType::Write: + [[fallthrough]]; 
+ case OpType::Write2: + [[fallthrough]]; + case OpType::Write3: + return fmt::format("Write{} ({})", numIOs, offset_length_desc); + case OpType::FailedWrite: + [[fallthrough]]; + case OpType::FailedWrite2: + [[fallthrough]]; + case OpType::FailedWrite3: + return fmt::format("FailedWrite{} ({})", numIOs, offset_length_desc); + default: + ceph_abort_msg( + fmt::format("Unsupported op type by ReadWriteOp ({})", opType)); } - return std::make_unique<IoOp>(OpType::READ3, - offset1, length1, - offset2, length2, - offset3, length3); } -std::unique_ptr<IoOp> IoOp::generate_write(uint64_t offset, uint64_t length) { - return std::make_unique<IoOp>(OpType::WRITE, offset, length); +SingleReadOp::SingleReadOp(uint64_t offset, uint64_t length) + : ReadWriteOp<OpType::Read, 1>({offset}, {length}) {} + +std::unique_ptr<SingleReadOp> SingleReadOp::generate(uint64_t offset, + uint64_t length) { + return std::make_unique<SingleReadOp>(offset, length); } -std::unique_ptr<IoOp> IoOp::generate_write2(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2) { - if (offset1 < offset2) { - ceph_assert( offset1 + length1 <= offset2 ); - } else { - ceph_assert( offset2 + length2 <= offset1 ); - } - return std::make_unique<IoOp>(OpType::WRITE2, - offset1, length1, - offset2, length2); +DoubleReadOp::DoubleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2) + : ReadWriteOp<OpType::Read2, 2>({offset1, offset2}, {length1, length2}) {} + +std::unique_ptr<DoubleReadOp> DoubleReadOp::generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2) { + return std::make_unique<DoubleReadOp>(offset1, length1, offset2, length2); } -std::unique_ptr<IoOp> IoOp::generate_write3(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2, - uint64_t offset3, uint64_t length3) { - if (offset1 < offset2) { - ceph_assert( offset1 + length1 <= offset2 ); - } else { - ceph_assert( offset2 + length2 <= offset1 ); - } - if (offset1 
< offset3) { - ceph_assert( offset1 + length1 <= offset3 ); - } else { - ceph_assert( offset3 + length3 <= offset1 ); - } - if (offset2 < offset3) { - ceph_assert( offset2 + length2 <= offset3 ); - } else { - ceph_assert( offset3 + length3 <= offset2 ); - } - return std::make_unique<IoOp>(OpType::WRITE3, - offset1, length1, - offset2, length2, - offset3, length3); -} - -bool IoOp::done() { - return (op == OpType::Done); -} - -std::string IoOp::to_string(uint64_t block_size) const -{ - switch (op) { - case OpType::Done: - return "Done"; - case OpType::BARRIER: - return "Barrier"; - case OpType::CREATE: - return "Create (size=" + value_to_string(length1 * block_size) + ")"; - case OpType::REMOVE: - return "Remove"; - case OpType::READ: - return "Read (offset=" + value_to_string(offset1 * block_size) + - ",length=" + value_to_string(length1 * block_size) + ")"; - case OpType::READ2: - return "Read2 (offset1=" + value_to_string(offset1 * block_size) + - ",length1=" + value_to_string(length1 * block_size) + - ",offset2=" + value_to_string(offset2 * block_size) + - ",length2=" + value_to_string(length2 * block_size) + ")"; - case OpType::READ3: - return "Read3 (offset1=" + value_to_string(offset1 * block_size) + - ",length1=" + value_to_string(length1 * block_size) + - ",offset2=" + value_to_string(offset2 * block_size) + - ",length2=" + value_to_string(length2 * block_size) + - ",offset3=" + value_to_string(offset3 * block_size) + - ",length3=" + value_to_string(length3 * block_size) + ")"; - case OpType::WRITE: - return "Write (offset=" + value_to_string(offset1 * block_size) + - ",length=" + value_to_string(length1 * block_size) + ")"; - case OpType::WRITE2: - return "Write2 (offset1=" + value_to_string(offset1 * block_size) + - ",length1=" + value_to_string(length1 * block_size) + - ",offset2=" + value_to_string(offset2 * block_size) + - ",length2=" + value_to_string(length2 * block_size) + ")"; - case OpType::WRITE3: - return "Write3 (offset1=" + 
value_to_string(offset1 * block_size) + - ",length1=" + value_to_string(length1 * block_size) + - ",offset2=" + value_to_string(offset2 * block_size) + - ",length2=" + value_to_string(length2 * block_size) + - ",offset3=" + value_to_string(offset3 * block_size) + - ",length3=" + value_to_string(length3 * block_size) + ")"; - default: - break; - } - return "Unknown"; +TripleReadOp::TripleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2, uint64_t offset3, uint64_t length3) + : ReadWriteOp<OpType::Read3, 3>({offset1, offset2, offset3}, + {length1, length2, length3}) {} + +std::unique_ptr<TripleReadOp> TripleReadOp::generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) { + return std::make_unique<TripleReadOp>(offset1, length1, offset2, length2, + offset3, length3); +} + +SingleWriteOp::SingleWriteOp(uint64_t offset, uint64_t length) + : ReadWriteOp<OpType::Write, 1>({offset}, {length}) {} + +std::unique_ptr<SingleWriteOp> SingleWriteOp::generate(uint64_t offset, + uint64_t length) { + return std::make_unique<SingleWriteOp>(offset, length); +} + +DoubleWriteOp::DoubleWriteOp(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2) + : ReadWriteOp<OpType::Write2, 2>({offset1, offset2}, {length1, length2}) {} + +std::unique_ptr<DoubleWriteOp> DoubleWriteOp::generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2) { + return std::make_unique<DoubleWriteOp>(offset1, length1, offset2, length2); +} + +TripleWriteOp::TripleWriteOp(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) + : ReadWriteOp<OpType::Write3, 3>({offset1, offset2, offset3}, + {length1, length2, length3}) {} + +std::unique_ptr<TripleWriteOp> TripleWriteOp::generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) { + return 
std::make_unique<TripleWriteOp>(offset1, length1, offset2, length2, + offset3, length3); +} + +SingleFailedWriteOp::SingleFailedWriteOp(uint64_t offset, uint64_t length) + : ReadWriteOp<OpType::FailedWrite, 1>({offset}, {length}) {} + +std::unique_ptr<SingleFailedWriteOp> SingleFailedWriteOp::generate( + uint64_t offset, uint64_t length) { + return std::make_unique<SingleFailedWriteOp>(offset, length); +} + +DoubleFailedWriteOp::DoubleFailedWriteOp(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2) + : ReadWriteOp<OpType::FailedWrite2, 2>({offset1, offset2}, + {length1, length2}) {} + +std::unique_ptr<DoubleFailedWriteOp> DoubleFailedWriteOp::generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2) { + return std::make_unique<DoubleFailedWriteOp>(offset1, length1, offset2, + length2); +} + +TripleFailedWriteOp::TripleFailedWriteOp(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) + : ReadWriteOp<OpType::FailedWrite3, 3>({offset1, offset2, offset3}, + {length1, length2, length3}) {} + +std::unique_ptr<TripleFailedWriteOp> TripleFailedWriteOp::generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) { + return std::make_unique<TripleFailedWriteOp>(offset1, length1, offset2, + length2, offset3, length3); +} + +template <ceph::io_exerciser::OpType opType> +ceph::io_exerciser::InjectErrorOp<opType>::InjectErrorOp( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) + : TestOp<opType>(), + shard(shard), + type(type), + when(when), + duration(duration) {} + +template <ceph::io_exerciser::OpType opType> +std::string ceph::io_exerciser::InjectErrorOp<opType>::to_string( + uint64_t blocksize) const { + std::string_view inject_type = get_inject_type_string(); + return fmt::format( + "Inject {} error on shard {} of type {}" 
+ " after {} successful inject(s) lasting {} inject(s)", + inject_type, shard, type.value_or(0), when.value_or(0), + duration.value_or(1)); +} + +ceph::io_exerciser::InjectReadErrorOp::InjectReadErrorOp( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) + : InjectErrorOp<OpType::InjectReadError>(shard, type, when, duration) {} + +std::unique_ptr<ceph::io_exerciser::InjectReadErrorOp> +ceph::io_exerciser ::InjectReadErrorOp::generate( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) { + return std::make_unique<InjectReadErrorOp>(shard, type, when, duration); +} + +ceph::io_exerciser::InjectWriteErrorOp::InjectWriteErrorOp( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) + : InjectErrorOp<OpType::InjectWriteError>(shard, type, when, duration) {} + +std::unique_ptr<ceph::io_exerciser::InjectWriteErrorOp> +ceph::io_exerciser ::InjectWriteErrorOp::generate( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) { + return std::make_unique<InjectWriteErrorOp>(shard, type, when, duration); +} + +template <ceph::io_exerciser::OpType opType> +ceph::io_exerciser::ClearErrorInjectOp<opType>::ClearErrorInjectOp( + int shard, const std::optional<uint64_t>& type) + : TestOp<opType>(), shard(shard), type(type) {} + +template <ceph::io_exerciser::OpType opType> +std::string ceph::io_exerciser::ClearErrorInjectOp<opType>::to_string( + uint64_t blocksize) const { + std::string_view inject_type = get_inject_type_string(); + return fmt::format("Clear {} injects on shard {} of type {}", inject_type, + shard, type.value_or(0)); +} + +ceph::io_exerciser::ClearReadErrorInjectOp::ClearReadErrorInjectOp( + int shard, const std::optional<uint64_t>& type) + : 
ClearErrorInjectOp<OpType::ClearReadErrorInject>(shard, type) {} + +std::unique_ptr<ceph::io_exerciser::ClearReadErrorInjectOp> +ceph::io_exerciser ::ClearReadErrorInjectOp::generate( + int shard, const std::optional<uint64_t>& type) { + return std::make_unique<ClearReadErrorInjectOp>(shard, type); +} + +ceph::io_exerciser::ClearWriteErrorInjectOp::ClearWriteErrorInjectOp( + int shard, const std::optional<uint64_t>& type) + : ClearErrorInjectOp<OpType::ClearWriteErrorInject>(shard, type) {} + +std::unique_ptr<ceph::io_exerciser::ClearWriteErrorInjectOp> +ceph::io_exerciser ::ClearWriteErrorInjectOp::generate( + int shard, const std::optional<uint64_t>& type) { + return std::make_unique<ClearWriteErrorInjectOp>(shard, type); }
\ No newline at end of file diff --git a/src/common/io_exerciser/IoOp.h b/src/common/io_exerciser/IoOp.h index 60c02a93d4e..1887eafcc1f 100644 --- a/src/common/io_exerciser/IoOp.h +++ b/src/common/io_exerciser/IoOp.h @@ -1,94 +1,248 @@ #pragma once -#include <string> +#include <array> #include <memory> -#include "include/ceph_assert.h" +#include <optional> +#include <string> + +#include "OpType.h" /* Overview * - * enum OpType - * Enumeration of different types of I/O operation - * * class IoOp * Stores details for an I/O operation. Generated by IoSequences * and applied by IoExerciser's */ namespace ceph { - namespace io_exerciser { - - enum class OpType { - Done, // End of I/O sequence - BARRIER, // Barrier - all prior I/Os must complete - CREATE, // Create object and pattern with data - REMOVE, // Remove object - READ, // Read - READ2, // 2 Reads in one op - READ3, // 3 Reads in one op - WRITE, // Write - WRITE2, // 2 Writes in one op - WRITE3 // 3 Writes in one op - }; - - class IoOp { - protected: - std::string value_to_string(uint64_t v) const; - - public: - OpType op; - uint64_t offset1; - uint64_t length1; - uint64_t offset2; - uint64_t length2; - uint64_t offset3; - uint64_t length3; - - IoOp( OpType op, - uint64_t offset1 = 0, uint64_t length1 = 0, - uint64_t offset2 = 0, uint64_t length2 = 0, - uint64_t offset3 = 0, uint64_t length3 = 0 ); - - static std::unique_ptr<IoOp> generate_done(); - - static std::unique_ptr<IoOp> generate_barrier(); - - static std::unique_ptr<IoOp> generate_create(uint64_t size); - - static std::unique_ptr<IoOp> generate_remove(); - - static std::unique_ptr<IoOp> generate_read(uint64_t offset, +namespace io_exerciser { + +class IoOp { + public: + IoOp(); + virtual ~IoOp() = default; + virtual std::string to_string(uint64_t block_size) const = 0; + virtual constexpr OpType getOpType() const = 0; +}; + +template <OpType opType> +class TestOp : public IoOp { + public: + TestOp(); + constexpr OpType getOpType() const override { 
return opType; } +}; + +class DoneOp : public TestOp<OpType::Done> { + public: + DoneOp(); + static std::unique_ptr<DoneOp> generate(); + std::string to_string(uint64_t block_size) const override; +}; + +class BarrierOp : public TestOp<OpType::Barrier> { + public: + BarrierOp(); + static std::unique_ptr<BarrierOp> generate(); + std::string to_string(uint64_t block_size) const override; +}; + +class CreateOp : public TestOp<OpType::Create> { + public: + CreateOp(uint64_t size); + static std::unique_ptr<CreateOp> generate(uint64_t size); + std::string to_string(uint64_t block_size) const override; + uint64_t size; +}; + +class RemoveOp : public TestOp<OpType::Remove> { + public: + RemoveOp(); + static std::unique_ptr<RemoveOp> generate(); + std::string to_string(uint64_t block_size) const override; +}; + +template <OpType opType, int numIOs> +class ReadWriteOp : public TestOp<opType> { + public: + std::array<uint64_t, numIOs> offset; + std::array<uint64_t, numIOs> length; + + protected: + ReadWriteOp(std::array<uint64_t, numIOs>&& offset, + std::array<uint64_t, numIOs>&& length); + std::string to_string(uint64_t block_size) const override; +}; + +class SingleReadOp : public ReadWriteOp<OpType::Read, 1> { + public: + SingleReadOp(uint64_t offset, uint64_t length); + static std::unique_ptr<SingleReadOp> generate(uint64_t offset, + uint64_t length); +}; + +class DoubleReadOp : public ReadWriteOp<OpType::Read2, 2> { + public: + DoubleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2); + static std::unique_ptr<DoubleReadOp> generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2); +}; + +class TripleReadOp : public ReadWriteOp<OpType::Read3, 3> { + public: + TripleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2, uint64_t offset3, uint64_t length3); + static std::unique_ptr<TripleReadOp> generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t 
offset3, uint64_t length3); +}; + +class SingleWriteOp : public ReadWriteOp<OpType::Write, 1> { + public: + SingleWriteOp(uint64_t offset, uint64_t length); + static std::unique_ptr<SingleWriteOp> generate(uint64_t offset, uint64_t length); +}; + +class DoubleWriteOp : public ReadWriteOp<OpType::Write2, 2> { + public: + DoubleWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2); + static std::unique_ptr<DoubleWriteOp> generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2); +}; + +class TripleWriteOp : public ReadWriteOp<OpType::Write3, 3> { + public: + TripleWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2, uint64_t offset3, uint64_t length3); + static std::unique_ptr<TripleWriteOp> generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3); +}; + +class SingleFailedWriteOp : public ReadWriteOp<OpType::FailedWrite, 1> { + public: + SingleFailedWriteOp(uint64_t offset, uint64_t length); + static std::unique_ptr<SingleFailedWriteOp> generate(uint64_t offset, + uint64_t length); +}; + +class DoubleFailedWriteOp : public ReadWriteOp<OpType::FailedWrite2, 2> { + public: + DoubleFailedWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2); + static std::unique_ptr<DoubleFailedWriteOp> generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2); +}; + +class TripleFailedWriteOp : public ReadWriteOp<OpType::FailedWrite3, 3> { + public: + TripleFailedWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2, uint64_t offset3, uint64_t length3); + static std::unique_ptr<TripleFailedWriteOp> generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3); +}; + +template <ceph::io_exerciser::OpType opType> +class InjectErrorOp : public TestOp<opType> { + public: + InjectErrorOp(int shard, 
const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + std::string to_string(uint64_t block_size) const override; + + int shard; + std::optional<uint64_t> type; + std::optional<uint64_t> when; + std::optional<uint64_t> duration; + + protected: + virtual inline constexpr std::string_view get_inject_type_string() const = 0; +}; + +class InjectReadErrorOp : public InjectErrorOp<OpType::InjectReadError> { + public: + InjectReadErrorOp(int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + static std::unique_ptr<InjectReadErrorOp> generate( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + protected: + inline constexpr std::string_view get_inject_type_string() const override { + return "read"; + } +}; + +class InjectWriteErrorOp : public InjectErrorOp<OpType::InjectWriteError> { + public: + InjectWriteErrorOp(int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + static std::unique_ptr<InjectWriteErrorOp> generate( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + protected: + inline constexpr std::string_view get_inject_type_string() const override { + return "write"; + } +}; + +template <ceph::io_exerciser::OpType opType> +class ClearErrorInjectOp : public TestOp<opType> { + public: + ClearErrorInjectOp(int shard, const std::optional<uint64_t>& type); + + std::string to_string(uint64_t block_size) const override; + + int shard; + std::optional<uint64_t> type; + + protected: + virtual inline constexpr std::string_view get_inject_type_string() const = 0; +}; + +class ClearReadErrorInjectOp + : public ClearErrorInjectOp<OpType::ClearReadErrorInject> { + public: + 
ClearReadErrorInjectOp(int shard, const std::optional<uint64_t>& type); + + static std::unique_ptr<ClearReadErrorInjectOp> generate( + int shard, const std::optional<uint64_t>& type); + + protected: + inline constexpr std::string_view get_inject_type_string() const override { + return "read"; + } +}; + +class ClearWriteErrorInjectOp + : public ClearErrorInjectOp<OpType::ClearWriteErrorInject> { + public: + ClearWriteErrorInjectOp(int shard, const std::optional<uint64_t>& type); + + static std::unique_ptr<ClearWriteErrorInjectOp> generate( + int shard, const std::optional<uint64_t>& type); - static std::unique_ptr<IoOp> generate_read2(uint64_t offset1, - uint64_t length1, - uint64_t offset2, - uint64_t length2); - - static std::unique_ptr<IoOp> generate_read3(uint64_t offset1, - uint64_t length1, - uint64_t offset2, - uint64_t length2, - uint64_t offset3, - uint64_t length3); - - static std::unique_ptr<IoOp> generate_write(uint64_t offset, - uint64_t length); - - static std::unique_ptr<IoOp> generate_write2(uint64_t offset1, - uint64_t length1, - uint64_t offset2, - uint64_t length2); - - static std::unique_ptr<IoOp> generate_write3(uint64_t offset1, - uint64_t length1, - uint64_t offset2, - uint64_t length2, - uint64_t offset3, - uint64_t length3); - - bool done(); - - std::string to_string(uint64_t block_size) const; - }; + protected: + inline constexpr std::string_view get_inject_type_string() const override { + return "write"; } -}
\ No newline at end of file +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/IoSequence.cc b/src/common/io_exerciser/IoSequence.cc index 4a7ca0593d1..83f1cc595a5 100644 --- a/src/common/io_exerciser/IoSequence.cc +++ b/src/common/io_exerciser/IoSequence.cc @@ -1,12 +1,12 @@ #include "IoSequence.h" +using IoOp = ceph::io_exerciser::IoOp; using Sequence = ceph::io_exerciser::Sequence; using IoSequence = ceph::io_exerciser::IoSequence; -std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& seq) -{ - switch (seq) - { +std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, + const Sequence& seq) { + switch (seq) { case Sequence::SEQUENCE_SEQ0: os << "SEQUENCE_SEQ0"; break; @@ -37,6 +37,9 @@ std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& s case Sequence::SEQUENCE_SEQ9: os << "SEQUENCE_SEQ9"; break; + case Sequence::SEQUENCE_SEQ10: + os << "SEQUENCE_SEQ10"; + break; case Sequence::SEQUENCE_END: os << "SEQUENCE_END"; break; @@ -44,19 +47,12 @@ std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& s return os; } -IoSequence::IoSequence(std::pair<int,int> obj_size_range, - int seed) : - min_obj_size(obj_size_range.first), max_obj_size(obj_size_range.second), - create(true), barrier(false), done(false), remove(false), - obj_size(min_obj_size), step(-1), seed(seed) -{ - rng.seed(seed); +bool IoSequence::is_supported(Sequence sequence) const { + return sequence != Sequence::SEQUENCE_SEQ10; } -std::unique_ptr<IoSequence> IoSequence::generate_sequence(Sequence s, - std::pair<int,int> obj_size_range, - int seed) -{ +std::unique_ptr<IoSequence> IoSequence::generate_sequence( + Sequence s, std::pair<int, int> obj_size_range, int seed) { switch (s) { case Sequence::SEQUENCE_SEQ0: return std::make_unique<Seq0>(obj_size_range, seed); @@ -78,24 +74,39 @@ std::unique_ptr<IoSequence> IoSequence::generate_sequence(Sequence s, return std::make_unique<Seq8>(obj_size_range, seed); case Sequence::SEQUENCE_SEQ9: 
return std::make_unique<Seq9>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ10: + ceph_abort_msg( + "Sequence 10 only supported for erasure coded pools " + "through the EcIoSequence interface"); + return nullptr; default: break; } return nullptr; } -int IoSequence::get_step() const -{ - return step; +IoSequence::IoSequence(std::pair<int, int> obj_size_range, int seed) + : min_obj_size(obj_size_range.first), + max_obj_size(obj_size_range.second), + create(true), + barrier(false), + done(false), + remove(false), + obj_size(min_obj_size), + step(-1), + seed(seed) { + rng.seed(seed); } -int IoSequence::get_seed() const -{ - return seed; +std::string ceph::io_exerciser::IoSequence::get_name_with_seqseed() const { + return get_name() + " (seqseed " + std::to_string(get_seed()) + ")"; } -void IoSequence::set_min_object_size(uint64_t size) -{ +int IoSequence::get_step() const { return step; } + +int IoSequence::get_seed() const { return seed; } + +void IoSequence::set_min_object_size(uint64_t size) { min_obj_size = size; if (obj_size < size) { obj_size = size; @@ -105,23 +116,20 @@ void IoSequence::set_min_object_size(uint64_t size) } } -void IoSequence::set_max_object_size(uint64_t size) -{ +void IoSequence::set_max_object_size(uint64_t size) { max_obj_size = size; if (obj_size > size) { done = true; } } -void IoSequence::select_random_object_size() -{ +void IoSequence::select_random_object_size() { if (max_obj_size != min_obj_size) { obj_size = min_obj_size + rng(max_obj_size - min_obj_size); } } -std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::increment_object_size() -{ +std::unique_ptr<IoOp> IoSequence::increment_object_size() { obj_size++; if (obj_size > max_obj_size) { done = true; @@ -129,106 +137,118 @@ std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::increment_object_size() create = true; barrier = true; remove = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } -std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::next() 
-{ +Sequence IoSequence::getNextSupportedSequenceId() const { + Sequence sequence = get_id(); + ++sequence; + for (; sequence < Sequence::SEQUENCE_END; ++sequence) { + if (is_supported(sequence)) { + return sequence; + } + } + + return Sequence::SEQUENCE_END; +} + +std::unique_ptr<IoOp> IoSequence::next() { step++; if (remove) { remove = false; - return IoOp::generate_remove(); + return RemoveOp::generate(); } if (barrier) { barrier = false; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } if (done) { - return IoOp::generate_done(); + return DoneOp::generate(); } if (create) { create = false; barrier = true; - return IoOp::generate_create(obj_size); + return CreateOp::generate(obj_size); } return _next(); } - - -ceph::io_exerciser::Seq0::Seq0(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0) -{ +ceph::io_exerciser::Seq0::Seq0(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset(0) { select_random_object_size(); length = 1 + rng(obj_size - 1); } -std::string ceph::io_exerciser::Seq0::get_name() const -{ +Sequence ceph::io_exerciser::Seq0::get_id() const { + return Sequence::SEQUENCE_SEQ0; +} + +std::string ceph::io_exerciser::Seq0::get_name() const { return "Sequential reads of length " + std::to_string(length) + - " with queue depth 1 (seqseed " + std::to_string(get_seed()) + ")"; + " with queue depth 1"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq0::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq0::_next() { std::unique_ptr<IoOp> r; if (offset >= obj_size) { done = true; barrier = true; remove = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } if (offset + length > obj_size) { - r = IoOp::generate_read(offset, obj_size - offset); + r = SingleReadOp::generate(offset, obj_size - offset); } else { - r = IoOp::generate_read(offset, length); + r = SingleReadOp::generate(offset, length); } 
offset += length; return r; } - - -ceph::io_exerciser::Seq1::Seq1(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed) -{ +ceph::io_exerciser::Seq1::Seq1(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed) { select_random_object_size(); count = 3 * obj_size; } -std::string ceph::io_exerciser::Seq1::get_name() const -{ - return "Random offset, random length read/write I/O with queue depth 1 (seqseed " - + std::to_string(get_seed()) + ")"; +Sequence ceph::io_exerciser::Seq1::get_id() const { + return Sequence::SEQUENCE_SEQ1; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq1::_next() -{ +std::string ceph::io_exerciser::Seq1::get_name() const { + return "Random offset, random length read/write I/O with queue depth 1"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq1::_next() { barrier = true; if (count-- == 0) { done = true; remove = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } uint64_t offset = rng(obj_size - 1); uint64_t length = 1 + rng(obj_size - 1 - offset); - return (rng(2) != 0) ? 
IoOp::generate_write(offset, length) : - IoOp::generate_read(offset, length); -} + if (rng(2) != 0) { + return SingleWriteOp::generate(offset, length); + } else { + return SingleReadOp::generate(offset, length); + } +} +ceph::io_exerciser::Seq2::Seq2(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset(0), length(0) {} -ceph::io_exerciser::Seq2::Seq2(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0), length(0) {} +Sequence ceph::io_exerciser::Seq2::get_id() const { + return Sequence::SEQUENCE_SEQ2; +} -std::string ceph::io_exerciser::Seq2::get_name() const -{ +std::string ceph::io_exerciser::Seq2::get_name() const { return "Permutations of offset and length read I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next() { length++; if (length > obj_size - offset) { length = 1; @@ -239,24 +259,23 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next() return increment_object_size(); } } - return IoOp::generate_read(offset, length); + return SingleReadOp::generate(offset, length); } - - -ceph::io_exerciser::Seq3::Seq3(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset1(0), offset2(0) -{ +ceph::io_exerciser::Seq3::Seq3(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset1(0), offset2(0) { set_min_object_size(2); } -std::string ceph::io_exerciser::Seq3::get_name() const -{ +Sequence ceph::io_exerciser::Seq3::get_id() const { + return Sequence::SEQUENCE_SEQ3; +} + +std::string ceph::io_exerciser::Seq3::get_name() const { return "Permutations of offset 2-region 1-block read I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next() { offset2++; if (offset2 >= obj_size - offset1) { 
offset2 = 1; @@ -267,24 +286,23 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next() return increment_object_size(); } } - return IoOp::generate_read2(offset1, 1, offset1 + offset2, 1); + return DoubleReadOp::generate(offset1, 1, offset1 + offset2, 1); } - - -ceph::io_exerciser::Seq4::Seq4(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset1(0), offset2(1) -{ +ceph::io_exerciser::Seq4::Seq4(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset1(0), offset2(1) { set_min_object_size(3); } -std::string ceph::io_exerciser::Seq4::get_name() const -{ +Sequence ceph::io_exerciser::Seq4::get_id() const { + return Sequence::SEQUENCE_SEQ4; +} + +std::string ceph::io_exerciser::Seq4::get_name() const { return "Permutations of offset 3-region 1-block read I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next() { offset2++; if (offset2 >= obj_size - offset1) { offset2 = 2; @@ -295,33 +313,35 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next() return increment_object_size(); } } - return IoOp::generate_read3(offset1, 1, - offset1 + offset2, 1, - (offset1 * 2 + offset2)/2, 1); + return TripleReadOp::generate(offset1, 1, (offset1 + offset2), 1, + (offset1 * 2 + offset2) / 2, 1); } +ceph::io_exerciser::Seq5::Seq5(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), + offset(0), + length(1), + doneread(false), + donebarrier(false) {} +Sequence ceph::io_exerciser::Seq5::get_id() const { + return Sequence::SEQUENCE_SEQ5; +} -ceph::io_exerciser::Seq5::Seq5(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0), length(1), - doneread(false), donebarrier(false) {} - -std::string ceph::io_exerciser::Seq5::get_name() const -{ +std::string ceph::io_exerciser::Seq5::get_name() const { return 
"Permutation of length sequential writes"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next() { if (offset >= obj_size) { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, obj_size); } doneread = false; donebarrier = false; @@ -333,33 +353,36 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next() } } uint64_t io_len = (offset + length > obj_size) ? (obj_size - offset) : length; - std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len); + std::unique_ptr<IoOp> r = SingleWriteOp::generate(offset, io_len); offset += io_len; return r; } +ceph::io_exerciser::Seq6::Seq6(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), + offset(0), + length(1), + doneread(false), + donebarrier(false) {} +Sequence ceph::io_exerciser::Seq6::get_id() const { + return Sequence::SEQUENCE_SEQ6; +} -ceph::io_exerciser::Seq6::Seq6(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0), length(1), - doneread(false), donebarrier(false) {} - -std::string ceph::io_exerciser::Seq6::get_name() const -{ +std::string ceph::io_exerciser::Seq6::get_name() const { return "Permutation of length sequential writes, different alignment"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next() { if (offset >= obj_size) { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, obj_size); } doneread = false; donebarrier = false; @@ -374,74 
+397,72 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next() if (io_len == 0) { io_len = length; } - std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len); + std::unique_ptr<IoOp> r = SingleWriteOp::generate(offset, io_len); offset += io_len; return r; } - - -ceph::io_exerciser::Seq7::Seq7(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed) -{ +ceph::io_exerciser::Seq7::Seq7(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed) { set_min_object_size(2); offset = obj_size; } -std::string ceph::io_exerciser::Seq7::get_name() const -{ +Sequence ceph::io_exerciser::Seq7::get_id() const { + return Sequence::SEQUENCE_SEQ7; +} + +std::string ceph::io_exerciser::Seq7::get_name() const { return "Permutations of offset 2-region 1-block writes"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq7::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq7::_next() { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, obj_size); } if (offset == 0) { doneread = false; donebarrier = false; - offset = obj_size+1; + offset = obj_size + 1; return increment_object_size(); } offset--; - if (offset == obj_size/2) { + if (offset == obj_size / 2) { return _next(); } doneread = false; donebarrier = false; - return IoOp::generate_write2(offset, 1, obj_size/2, 1); + return DoubleWriteOp::generate(offset, 1, obj_size / 2, 1); } - - -ceph::io_exerciser::Seq8::Seq8(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset1(0), offset2(1) -{ +ceph::io_exerciser::Seq8::Seq8(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset1(0), offset2(1) { set_min_object_size(3); } -std::string 
ceph::io_exerciser::Seq8::get_name() const -{ +Sequence ceph::io_exerciser::Seq8::get_id() const { + return Sequence::SEQUENCE_SEQ8; +} + +std::string ceph::io_exerciser::Seq8::get_name() const { return "Permutations of offset 3-region 1-block write I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next() { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, obj_size); } offset2++; if (offset2 >= obj_size - offset1) { @@ -455,34 +476,30 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next() } doneread = false; donebarrier = false; - return IoOp::generate_write3(offset1, 1, - offset1 + offset2, 1, - (offset1 * 2 + offset2)/2, 1); + return TripleWriteOp::generate(offset1, 1, offset1 + offset2, 1, + (offset1 * 2 + offset2) / 2, 1); } +ceph::io_exerciser::Seq9::Seq9(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset(0), length(0) {} - -ceph::io_exerciser::Seq9::Seq9(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0), length(0) -{ - +Sequence ceph::io_exerciser::Seq9::get_id() const { + return Sequence::SEQUENCE_SEQ9; } -std::string ceph::io_exerciser::Seq9::get_name() const -{ +std::string ceph::io_exerciser::Seq9::get_name() const { return "Permutations of offset and length write I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next() { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, 
obj_size); } length++; if (length > obj_size - offset) { @@ -496,5 +513,5 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next() } doneread = false; donebarrier = false; - return IoOp::generate_write(offset, length); + return SingleWriteOp::generate(offset, length); }
\ No newline at end of file diff --git a/src/common/io_exerciser/IoSequence.h b/src/common/io_exerciser/IoSequence.h index 114ff76303f..b6c254cf096 100644 --- a/src/common/io_exerciser/IoSequence.h +++ b/src/common/io_exerciser/IoSequence.h @@ -3,7 +3,6 @@ #pragma once #include "IoOp.h" - #include "include/random.h" /* Overview @@ -29,195 +28,209 @@ */ namespace ceph { - namespace io_exerciser { - - enum class Sequence { - SEQUENCE_SEQ0, - SEQUENCE_SEQ1, - SEQUENCE_SEQ2, - SEQUENCE_SEQ3, - SEQUENCE_SEQ4, - SEQUENCE_SEQ5, - SEQUENCE_SEQ6, - SEQUENCE_SEQ7, - SEQUENCE_SEQ8, - SEQUENCE_SEQ9, - // - SEQUENCE_END, - SEQUENCE_BEGIN = SEQUENCE_SEQ0 - }; - - inline Sequence operator++( Sequence& s ) - { - return s = (Sequence)(((int)(s) + 1)); - } - - std::ostream& operator<<(std::ostream& os, const Sequence& seq); - - /* I/O Sequences */ - - class IoSequence { - public: - virtual ~IoSequence() = default; - - virtual std::string get_name() const = 0; - int get_step() const; - int get_seed() const; - - std::unique_ptr<IoOp> next(); - - static std::unique_ptr<IoSequence> - generate_sequence(Sequence s, std::pair<int,int> obj_size_range, int seed ); - - protected: - uint64_t min_obj_size; - uint64_t max_obj_size; - bool create; - bool barrier; - bool done; - bool remove; - uint64_t obj_size; - int step; - int seed; - ceph::util::random_number_generator<int> rng = - ceph::util::random_number_generator<int>(); - - IoSequence(std::pair<int,int> obj_size_range, int seed); - - virtual std::unique_ptr<IoOp> _next() = 0; - - void set_min_object_size(uint64_t size); - void set_max_object_size(uint64_t size); - void select_random_object_size(); - std::unique_ptr<IoOp> increment_object_size(); - - }; - - class Seq0: public IoSequence { - public: - Seq0(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - uint64_t length; - }; - - class Seq1: public IoSequence { - public: - 
Seq1(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next(); - - private: - int count; - }; - - class Seq2: public IoSequence { - public: - Seq2(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - uint64_t length; - }; - - class Seq3: public IoSequence { - public: - Seq3(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - private: - uint64_t offset1; - uint64_t offset2; - }; - - class Seq4: public IoSequence { - public: - Seq4(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset1; - uint64_t offset2; - }; - - class Seq5: public IoSequence { - public: - Seq5(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - uint64_t length; - bool doneread; - bool donebarrier; - }; - - class Seq6: public IoSequence { - public: - Seq6(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - uint64_t length; - bool doneread; - bool donebarrier; - }; - - class Seq7: public IoSequence { - public: - Seq7(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - bool doneread = true; - bool donebarrier = false; - }; - - class Seq8: public IoSequence { - public: - Seq8(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - private: - uint64_t offset1; - uint64_t offset2; - bool doneread = true; - bool donebarrier = false; - }; 
- - class Seq9: public IoSequence { - private: - uint64_t offset; - uint64_t length; - bool doneread = true; - bool donebarrier = false; - - public: - Seq9(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - - std::unique_ptr<IoOp> _next() override; - }; - } -}
\ No newline at end of file +namespace io_exerciser { + +enum class Sequence { + SEQUENCE_SEQ0, + SEQUENCE_SEQ1, + SEQUENCE_SEQ2, + SEQUENCE_SEQ3, + SEQUENCE_SEQ4, + SEQUENCE_SEQ5, + SEQUENCE_SEQ6, + SEQUENCE_SEQ7, + SEQUENCE_SEQ8, + SEQUENCE_SEQ9, + SEQUENCE_SEQ10, + + SEQUENCE_END, + SEQUENCE_BEGIN = SEQUENCE_SEQ0 +}; + +inline Sequence operator++(Sequence& s) { + return s = (Sequence)(((int)(s) + 1)); +} + +std::ostream& operator<<(std::ostream& os, const Sequence& seq); + +/* I/O Sequences */ + +class IoSequence { + public: + virtual ~IoSequence() = default; + + virtual Sequence get_id() const = 0; + virtual std::string get_name_with_seqseed() const; + virtual std::string get_name() const = 0; + int get_step() const; + int get_seed() const; + + virtual Sequence getNextSupportedSequenceId() const; + virtual std::unique_ptr<IoOp> next(); + + virtual bool is_supported(Sequence sequence) const; + static std::unique_ptr<IoSequence> generate_sequence( + Sequence s, std::pair<int, int> obj_size_range, int seed); + + protected: + uint64_t min_obj_size; + uint64_t max_obj_size; + bool create; + bool barrier; + bool done; + bool remove; + uint64_t obj_size; + int step; + int seed; + ceph::util::random_number_generator<int> rng = + ceph::util::random_number_generator<int>(); + + IoSequence(std::pair<int, int> obj_size_range, int seed); + + virtual std::unique_ptr<IoOp> _next() = 0; + + void set_min_object_size(uint64_t size); + void set_max_object_size(uint64_t size); + void select_random_object_size(); + std::unique_ptr<IoOp> increment_object_size(); +}; + +class Seq0 : public IoSequence { + public: + Seq0(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; +}; + +class Seq1 : public IoSequence { + public: + Seq1(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + 
std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + int count; +}; + +class Seq2 : public IoSequence { + public: + Seq2(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; +}; + +class Seq3 : public IoSequence { + public: + Seq3(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset1; + uint64_t offset2; +}; + +class Seq4 : public IoSequence { + public: + Seq4(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset1; + uint64_t offset2; +}; + +class Seq5 : public IoSequence { + public: + Seq5(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + bool doneread; + bool donebarrier; +}; + +class Seq6 : public IoSequence { + public: + Seq6(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + bool doneread; + bool donebarrier; +}; + +class Seq7 : public IoSequence { + public: + Seq7(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + bool doneread = true; + bool donebarrier = false; +}; + +class Seq8 : public IoSequence { + public: + Seq8(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() 
const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset1; + uint64_t offset2; + bool doneread = true; + bool donebarrier = false; +}; + +class Seq9 : public IoSequence { + private: + uint64_t offset; + uint64_t length; + bool doneread = true; + bool donebarrier = false; + + public: + Seq9(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/Model.cc b/src/common/io_exerciser/Model.cc index 50812ecbb15..6548e1eda7a 100644 --- a/src/common/io_exerciser/Model.cc +++ b/src/common/io_exerciser/Model.cc @@ -4,25 +4,11 @@ using Model = ceph::io_exerciser::Model; -Model::Model(const std::string& oid, uint64_t block_size) : -num_io(0), -oid(oid), -block_size(block_size) -{ +Model::Model(const std::string& oid, uint64_t block_size) + : num_io(0), oid(oid), block_size(block_size) {} -} +const uint64_t Model::get_block_size() const { return block_size; } -const uint64_t Model::get_block_size() const -{ - return block_size; -} +const std::string Model::get_oid() const { return oid; } -const std::string Model::get_oid() const -{ - return oid; -} - -int Model::get_num_io() const -{ - return num_io; -}
\ No newline at end of file +int Model::get_num_io() const { return num_io; }
\ No newline at end of file diff --git a/src/common/io_exerciser/Model.h b/src/common/io_exerciser/Model.h index 58d107409a6..9e421e79a78 100644 --- a/src/common/io_exerciser/Model.h +++ b/src/common/io_exerciser/Model.h @@ -1,15 +1,13 @@ #pragma once -#include "IoOp.h" - #include <boost/asio/io_context.hpp> -#include "librados/librados_asio.h" - -#include "include/interval_set.h" -#include "global/global_init.h" -#include "global/global_context.h" +#include "IoOp.h" #include "common/Thread.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "include/interval_set.h" +#include "librados/librados_asio.h" /* Overview * @@ -21,29 +19,27 @@ */ namespace ceph { - namespace io_exerciser { - - class Model - { - protected: - int num_io{0}; - std::string oid; - uint64_t block_size; - - public: - Model(const std::string& oid, uint64_t block_size); - virtual ~Model() = default; - - virtual bool readyForIoOp(IoOp& op) = 0; - virtual void applyIoOp(IoOp& op) = 0; - - const std::string get_oid() const; - const uint64_t get_block_size() const; - int get_num_io() const; - }; - - /* Simple RADOS I/O generator */ - - - } -}
\ No newline at end of file +namespace io_exerciser { + +class Model { + protected: + int num_io{0}; + std::string oid; + uint64_t block_size; + + public: + Model(const std::string& oid, uint64_t block_size); + virtual ~Model() = default; + + virtual bool readyForIoOp(IoOp& op) = 0; + virtual void applyIoOp(IoOp& op) = 0; + + const std::string get_oid() const; + const uint64_t get_block_size() const; + int get_num_io() const; +}; + +/* Simple RADOS I/O generator */ + +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/ObjectModel.cc b/src/common/io_exerciser/ObjectModel.cc index 589f6434282..454d7254cf2 100644 --- a/src/common/io_exerciser/ObjectModel.cc +++ b/src/common/io_exerciser/ObjectModel.cc @@ -6,25 +6,20 @@ using ObjectModel = ceph::io_exerciser::ObjectModel; -ObjectModel::ObjectModel(const std::string& oid, uint64_t block_size, int seed) : - Model(oid, block_size), created(false) -{ +ObjectModel::ObjectModel(const std::string& oid, uint64_t block_size, int seed) + : Model(oid, block_size), created(false) { rng.seed(seed); } -int ObjectModel::get_seed(uint64_t offset) const -{ +int ObjectModel::get_seed(uint64_t offset) const { ceph_assert(offset < contents.size()); return contents[offset]; } -std::vector<int> ObjectModel::get_seed_offsets(int seed) const -{ +std::vector<int> ObjectModel::get_seed_offsets(int seed) const { std::vector<int> offsets; - for (size_t i = 0; i < contents.size(); i++) - { - if (contents[i] == seed) - { + for (size_t i = 0; i < contents.size(); i++) { + if (contents[i] == seed) { offsets.push_back(i); } } @@ -32,8 +27,7 @@ std::vector<int> ObjectModel::get_seed_offsets(int seed) const return offsets; } -std::string ObjectModel::to_string(int mask) const -{ +std::string ObjectModel::to_string(int mask) const { if (!created) { return "Object does not exist"; } @@ -48,107 +42,127 @@ std::string ObjectModel::to_string(int mask) const return result; } -bool ObjectModel::readyForIoOp(IoOp& op) -{ - return true; -} - -void ObjectModel::applyIoOp(IoOp& op) -{ - auto generate_random = [&rng = rng]() { - return rng(); - }; - - switch (op.op) { - case OpType::BARRIER: - reads.clear(); - writes.clear(); - break; - - case OpType::CREATE: - ceph_assert(!created); - ceph_assert(reads.empty()); - ceph_assert(writes.empty()); - created = true; - contents.resize(op.length1); - std::generate(std::execution::seq, contents.begin(), contents.end(), - generate_random); - break; - - case OpType::REMOVE: 
- ceph_assert(created); - ceph_assert(reads.empty()); - ceph_assert(writes.empty()); - created = false; - contents.resize(0); - break; - - case OpType::READ3: - ceph_assert(created); - ceph_assert(op.offset3 + op.length3 <= contents.size()); - // Not allowed: read overlapping with parallel write - ceph_assert(!writes.intersects(op.offset3, op.length3)); - reads.union_insert(op.offset3, op.length3); - [[fallthrough]]; - - case OpType::READ2: - ceph_assert(created); - ceph_assert(op.offset2 + op.length2 <= contents.size()); - // Not allowed: read overlapping with parallel write - ceph_assert(!writes.intersects(op.offset2, op.length2)); - reads.union_insert(op.offset2, op.length2); - [[fallthrough]]; - - case OpType::READ: - ceph_assert(created); - ceph_assert(op.offset1 + op.length1 <= contents.size()); - // Not allowed: read overlapping with parallel write - ceph_assert(!writes.intersects(op.offset1, op.length1)); - reads.union_insert(op.offset1, op.length1); - num_io++; - break; - - case OpType::WRITE3: - ceph_assert(created); - // Not allowed: write overlapping with parallel read or write - ceph_assert(!reads.intersects(op.offset3, op.length3)); - ceph_assert(!writes.intersects(op.offset3, op.length3)); - writes.union_insert(op.offset3, op.length3); - ceph_assert(op.offset3 + op.length3 <= contents.size()); - std::generate(std::execution::seq, - std::next(contents.begin(), op.offset3), - std::next(contents.begin(), op.offset3 + op.length3), - generate_random); - [[fallthrough]]; - - case OpType::WRITE2: - ceph_assert(created); - // Not allowed: write overlapping with parallel read or write - ceph_assert(!reads.intersects(op.offset2, op.length2)); - ceph_assert(!writes.intersects(op.offset2, op.length2)); - writes.union_insert(op.offset2, op.length2); - ceph_assert(op.offset2 + op.length2 <= contents.size()); - std::generate(std::execution::seq, - std::next(contents.begin(), op.offset2), - std::next(contents.begin(), op.offset2 + op.length2), - generate_random); - 
[[fallthrough]]; - - case OpType::WRITE: - ceph_assert(created); - // Not allowed: write overlapping with parallel read or write - ceph_assert(!reads.intersects(op.offset1, op.length1)); - ceph_assert(!writes.intersects(op.offset1, op.length1)); - writes.union_insert(op.offset1, op.length1); - ceph_assert(op.offset1 + op.length1 <= contents.size()); - std::generate(std::execution::seq, - std::next(contents.begin(), op.offset1), - std::next(contents.begin(), op.offset1 + op.length1), - generate_random); - num_io++; - break; - default: - break; +bool ObjectModel::readyForIoOp(IoOp& op) { return true; } + +void ObjectModel::applyIoOp(IoOp& op) { + auto generate_random = [&rng = rng]() { return rng(); }; + + auto verify_and_record_read_op = + [&contents = contents, &created = created, &num_io = num_io, + &reads = reads, + &writes = writes]<OpType opType, int N>(ReadWriteOp<opType, N>& readOp) { + ceph_assert(created); + for (int i = 0; i < N; i++) { + ceph_assert(readOp.offset[i] + readOp.length[i] <= contents.size()); + // Not allowed: read overlapping with parallel write + ceph_assert(!writes.intersects(readOp.offset[i], readOp.length[i])); + reads.union_insert(readOp.offset[i], readOp.length[i]); + } + num_io++; + }; + + auto verify_write_and_record_and_generate_seed = + [&generate_random, &contents = contents, &created = created, + &num_io = num_io, &reads = reads, + &writes = writes]<OpType opType, int N>(ReadWriteOp<opType, N> writeOp) { + ceph_assert(created); + for (int i = 0; i < N; i++) { + // Not allowed: write overlapping with parallel read or write + ceph_assert(!reads.intersects(writeOp.offset[i], writeOp.length[i])); + ceph_assert(!writes.intersects(writeOp.offset[i], writeOp.length[i])); + writes.union_insert(writeOp.offset[i], writeOp.length[i]); + ceph_assert(writeOp.offset[i] + writeOp.length[i] <= contents.size()); + std::generate(std::execution::seq, + std::next(contents.begin(), writeOp.offset[i]), + std::next(contents.begin(), + writeOp.offset[i] 
+ writeOp.length[i]), + generate_random); + } + num_io++; + }; + + auto verify_failed_write_and_record = + [&contents = contents, &created = created, &num_io = num_io, + &reads = reads, + &writes = writes]<OpType opType, int N>(ReadWriteOp<opType, N> writeOp) { + // Ensure write should still be valid, even though we are expecting OSD + // failure + ceph_assert(created); + for (int i = 0; i < N; i++) { + // Not allowed: write overlapping with parallel read or write + ceph_assert(!reads.intersects(writeOp.offset[i], writeOp.length[i])); + ceph_assert(!writes.intersects(writeOp.offset[i], writeOp.length[i])); + writes.union_insert(writeOp.offset[i], writeOp.length[i]); + ceph_assert(writeOp.offset[i] + writeOp.length[i] <= contents.size()); + } + num_io++; + }; + + switch (op.getOpType()) { + case OpType::Barrier: + reads.clear(); + writes.clear(); + break; + + case OpType::Create: + ceph_assert(!created); + ceph_assert(reads.empty()); + ceph_assert(writes.empty()); + created = true; + contents.resize(static_cast<CreateOp&>(op).size); + std::generate(std::execution::seq, contents.begin(), contents.end(), + generate_random); + break; + + case OpType::Remove: + ceph_assert(created); + ceph_assert(reads.empty()); + ceph_assert(writes.empty()); + created = false; + contents.resize(0); + break; + + case OpType::Read: { + SingleReadOp& readOp = static_cast<SingleReadOp&>(op); + verify_and_record_read_op(readOp); + } break; + case OpType::Read2: { + DoubleReadOp& readOp = static_cast<DoubleReadOp&>(op); + verify_and_record_read_op(readOp); + } break; + case OpType::Read3: { + TripleReadOp& readOp = static_cast<TripleReadOp&>(op); + verify_and_record_read_op(readOp); + } break; + + case OpType::Write: { + ceph_assert(created); + SingleWriteOp& writeOp = static_cast<SingleWriteOp&>(op); + verify_write_and_record_and_generate_seed(writeOp); + } break; + case OpType::Write2: { + DoubleWriteOp& writeOp = static_cast<DoubleWriteOp&>(op); + 
verify_write_and_record_and_generate_seed(writeOp); + } break; + case OpType::Write3: { + TripleWriteOp& writeOp = static_cast<TripleWriteOp&>(op); + verify_write_and_record_and_generate_seed(writeOp); + } break; + case OpType::FailedWrite: { + ceph_assert(created); + SingleWriteOp& writeOp = static_cast<SingleWriteOp&>(op); + verify_failed_write_and_record(writeOp); + } break; + case OpType::FailedWrite2: { + DoubleWriteOp& writeOp = static_cast<DoubleWriteOp&>(op); + verify_failed_write_and_record(writeOp); + } break; + case OpType::FailedWrite3: { + TripleWriteOp& writeOp = static_cast<TripleWriteOp&>(op); + verify_failed_write_and_record(writeOp); + } break; + default: + break; } } diff --git a/src/common/io_exerciser/ObjectModel.h b/src/common/io_exerciser/ObjectModel.h index 93c70f41429..cad1307b84e 100644 --- a/src/common/io_exerciser/ObjectModel.h +++ b/src/common/io_exerciser/ObjectModel.h @@ -14,40 +14,41 @@ */ namespace ceph { - namespace io_exerciser { - /* Model of an object to track its data contents */ - - class ObjectModel : public Model { - private: - bool created; - std::vector<int> contents; - ceph::util::random_number_generator<int> rng = - ceph::util::random_number_generator<int>(); - - // Track read and write I/Os that can be submitted in - // parallel to detect violations: - // - // * Read may not overlap with a parallel write - // * Write may not overlap with a parallel read or write - // * Create / remove may not be in parallel with read or write - // - // Fix broken test cases by adding barrier ops to restrict - // I/O exercisers from issuing conflicting ops in parallel - interval_set<uint64_t> reads; - interval_set<uint64_t> writes; - public: - ObjectModel(const std::string& oid, uint64_t block_size, int seed); - - int get_seed(uint64_t offset) const; - std::vector<int> get_seed_offsets(int seed) const; - - std::string to_string(int mask = -1) const; - - bool readyForIoOp(IoOp& op); - void applyIoOp(IoOp& op); - - void 
encode(ceph::buffer::list& bl) const; - void decode(ceph::buffer::list::const_iterator& bl); - }; - } -}
\ No newline at end of file +namespace io_exerciser { +/* Model of an object to track its data contents */ + +class ObjectModel : public Model { + private: + bool created; + std::vector<int> contents; + ceph::util::random_number_generator<int> rng = + ceph::util::random_number_generator<int>(); + + // Track read and write I/Os that can be submitted in + // parallel to detect violations: + // + // * Read may not overlap with a parallel write + // * Write may not overlap with a parallel read or write + // * Create / remove may not be in parallel with read or write + // + // Fix broken test cases by adding barrier ops to restrict + // I/O exercisers from issuing conflicting ops in parallel + interval_set<uint64_t> reads; + interval_set<uint64_t> writes; + + public: + ObjectModel(const std::string& oid, uint64_t block_size, int seed); + + int get_seed(uint64_t offset) const; + std::vector<int> get_seed_offsets(int seed) const; + + std::string to_string(int mask = -1) const; + + bool readyForIoOp(IoOp& op); + void applyIoOp(IoOp& op); + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/OpType.h b/src/common/io_exerciser/OpType.h new file mode 100644 index 00000000000..7cddb805e45 --- /dev/null +++ b/src/common/io_exerciser/OpType.h @@ -0,0 +1,91 @@ +#pragma once + +#include <fmt/format.h> +#include <include/ceph_assert.h> + +/* Overview + * + * enum OpType + * Enumeration of different types of I/O operation + * + */ + +namespace ceph { +namespace io_exerciser { +enum class OpType { + Done, // End of I/O sequence + Barrier, // Barrier - all prior I/Os must complete + Create, // Create object and pattern with data + Remove, // Remove object + Read, // Read + Read2, // Two reads in a single op + Read3, // Three reads in a single op + Write, // Write + Write2, // Two writes in a single op + Write3, // Three writes in a single op + FailedWrite, // A write which should fail + FailedWrite2, // Two writes in one op which should fail + FailedWrite3, // Three writes in one op which should fail + InjectReadError, // Op to tell OSD to inject read errors + InjectWriteError, // Op to tell OSD to inject write errors + ClearReadErrorInject, // Op to tell OSD to clear read error injects + ClearWriteErrorInject // Op to tell OSD to clear write error injects +}; + +enum class InjectOpType { + None, + ReadEIO, + ReadMissingShard, + WriteFailAndRollback, + WriteOSDAbort +}; +} // namespace io_exerciser +} // namespace ceph + +template <> +struct fmt::formatter<ceph::io_exerciser::OpType> { + constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } + + auto format(ceph::io_exerciser::OpType opType, + fmt::format_context& ctx) const -> fmt::format_context::iterator { + switch (opType) { + case ceph::io_exerciser::OpType::Done: + return fmt::format_to(ctx.out(), "Done"); + case ceph::io_exerciser::OpType::Barrier: + return fmt::format_to(ctx.out(), "Barrier"); + case ceph::io_exerciser::OpType::Create: + return fmt::format_to(ctx.out(), "Create"); + case ceph::io_exerciser::OpType::Remove: + 
return fmt::format_to(ctx.out(), "Remove"); + case ceph::io_exerciser::OpType::Read: + return fmt::format_to(ctx.out(), "Read"); + case ceph::io_exerciser::OpType::Read2: + return fmt::format_to(ctx.out(), "Read2"); + case ceph::io_exerciser::OpType::Read3: + return fmt::format_to(ctx.out(), "Read3"); + case ceph::io_exerciser::OpType::Write: + return fmt::format_to(ctx.out(), "Write"); + case ceph::io_exerciser::OpType::Write2: + return fmt::format_to(ctx.out(), "Write2"); + case ceph::io_exerciser::OpType::Write3: + return fmt::format_to(ctx.out(), "Write3"); + case ceph::io_exerciser::OpType::FailedWrite: + return fmt::format_to(ctx.out(), "FailedWrite"); + case ceph::io_exerciser::OpType::FailedWrite2: + return fmt::format_to(ctx.out(), "FailedWrite2"); + case ceph::io_exerciser::OpType::FailedWrite3: + return fmt::format_to(ctx.out(), "FailedWrite3"); + case ceph::io_exerciser::OpType::InjectReadError: + return fmt::format_to(ctx.out(), "InjectReadError"); + case ceph::io_exerciser::OpType::InjectWriteError: + return fmt::format_to(ctx.out(), "InjectWriteError"); + case ceph::io_exerciser::OpType::ClearReadErrorInject: + return fmt::format_to(ctx.out(), "ClearReadErrorInject"); + case ceph::io_exerciser::OpType::ClearWriteErrorInject: + return fmt::format_to(ctx.out(), "ClearWriteErrorInject"); + default: + ceph_abort_msg("Unknown OpType"); + return fmt::format_to(ctx.out(), "Unknown OpType"); + } + } +};
\ No newline at end of file diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc index 44b82260263..a78c074228b 100644 --- a/src/common/io_exerciser/RadosIo.cc +++ b/src/common/io_exerciser/RadosIo.cc @@ -1,300 +1,453 @@ #include "RadosIo.h" +#include <fmt/format.h> +#include <json_spirit/json_spirit.h> + +#include <ranges> + #include "DataGenerator.h" +#include "common/ceph_json.h" +#include "common/json/OSDStructures.h" using RadosIo = ceph::io_exerciser::RadosIo; -RadosIo::RadosIo(librados::Rados& rados, - boost::asio::io_context& asio, - const std::string& pool, - const std::string& oid, - uint64_t block_size, - int seed, - int threads, - ceph::mutex& lock, - ceph::condition_variable& cond) : - Model(oid, block_size), - rados(rados), - asio(asio), - om(std::make_unique<ObjectModel>(oid, block_size, seed)), - db(data_generation::DataGenerator::create_generator( - data_generation::GenerationType::HeaderedSeededRandom, *om)), - pool(pool), - threads(threads), - lock(lock), - cond(cond), - outstanding_io(0) -{ +namespace { +template <typename S> +int send_osd_command(int osd, S& s, librados::Rados& rados, const char* name, + ceph::buffer::list& inbl, ceph::buffer::list* outbl, + Formatter* f) { + encode_json(name, s, f); + + std::ostringstream oss; + f->flush(oss); + int rc = rados.osd_command(osd, oss.str(), inbl, outbl, nullptr); + return rc; +} + +template <typename S> +int send_mon_command(S& s, librados::Rados& rados, const char* name, + ceph::buffer::list& inbl, ceph::buffer::list* outbl, + Formatter* f) { + encode_json(name, s, f); + + std::ostringstream oss; + f->flush(oss); + int rc = rados.mon_command(oss.str(), inbl, outbl, nullptr); + return rc; +} +} // namespace + +RadosIo::RadosIo(librados::Rados& rados, boost::asio::io_context& asio, + const std::string& pool, const std::string& oid, + const std::optional<std::vector<int>>& cached_shard_order, + uint64_t block_size, int seed, int threads, ceph::mutex& lock, + 
ceph::condition_variable& cond) + : Model(oid, block_size), + rados(rados), + asio(asio), + om(std::make_unique<ObjectModel>(oid, block_size, seed)), + db(data_generation::DataGenerator::create_generator( + data_generation::GenerationType::HeaderedSeededRandom, *om)), + pool(pool), + cached_shard_order(cached_shard_order), + threads(threads), + lock(lock), + cond(cond), + outstanding_io(0) { int rc; rc = rados.ioctx_create(pool.c_str(), io); ceph_assert(rc == 0); allow_ec_overwrites(true); } -RadosIo::~RadosIo() -{ -} +RadosIo::~RadosIo() {} -void RadosIo::start_io() -{ +void RadosIo::start_io() { std::lock_guard l(lock); outstanding_io++; } -void RadosIo::finish_io() -{ +void RadosIo::finish_io() { std::lock_guard l(lock); ceph_assert(outstanding_io > 0); outstanding_io--; cond.notify_all(); } -void RadosIo::wait_for_io(int count) -{ +void RadosIo::wait_for_io(int count) { std::unique_lock l(lock); while (outstanding_io > count) { cond.wait(l); } } -void RadosIo::allow_ec_overwrites(bool allow) -{ +void RadosIo::allow_ec_overwrites(bool allow) { int rc; bufferlist inbl, outbl; - std::string cmdstr = - "{\"prefix\": \"osd pool set\", \"pool\": \"" + pool + "\", \ + std::string cmdstr = "{\"prefix\": \"osd pool set\", \"pool\": \"" + pool + + "\", \ \"var\": \"allow_ec_overwrites\", \"val\": \"" + - (allow ? "true" : "false") + "\"}"; + (allow ? 
"true" : "false") + "\"}"; rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr); ceph_assert(rc == 0); } -RadosIo::AsyncOpInfo::AsyncOpInfo(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2, - uint64_t offset3, uint64_t length3 ) : - offset1(offset1), length1(length1), - offset2(offset2), length2(length2), - offset3(offset3), length3(length3) -{ +template <int N> +RadosIo::AsyncOpInfo<N>::AsyncOpInfo(const std::array<uint64_t, N>& offset, + const std::array<uint64_t, N>& length) + : offset(offset), length(length) {} -} - -bool RadosIo::readyForIoOp(IoOp &op) -{ - ceph_assert(ceph_mutex_is_locked_by_me(lock)); //Must be called with lock held +bool RadosIo::readyForIoOp(IoOp& op) { + ceph_assert( + ceph_mutex_is_locked_by_me(lock)); // Must be called with lock held if (!om->readyForIoOp(op)) { return false; } - switch (op.op) { - case OpType::Done: - case OpType::BARRIER: - return outstanding_io == 0; - default: - return outstanding_io < threads; + + switch (op.getOpType()) { + case OpType::Done: + case OpType::Barrier: + return outstanding_io == 0; + default: + return outstanding_io < threads; } } -void RadosIo::applyIoOp(IoOp &op) -{ - std::shared_ptr<AsyncOpInfo> op_info; - +void RadosIo::applyIoOp(IoOp& op) { om->applyIoOp(op); // If there are thread concurrent I/Os in flight then wait for // at least one I/O to complete - wait_for_io(threads-1); - - switch (op.op) { - case OpType::Done: - [[ fallthrough ]]; - case OpType::BARRIER: - // Wait for all outstanding I/O to complete - wait_for_io(0); - break; - - case OpType::CREATE: - { + wait_for_io(threads - 1); + + switch (op.getOpType()) { + case OpType::Done: + [[fallthrough]]; + case OpType::Barrier: + // Wait for all outstanding I/O to complete + wait_for_io(0); + break; + + case OpType::Create: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(0, op.length1); - op_info->bl1 = db->generate_data(0, op.length1); - op_info->wop.write_full(op_info->bl1); - auto create_cb = [this] 
(boost::system::error_code ec, - version_t ver) { + uint64_t opSize = static_cast<CreateOp&>(op).size; + std::shared_ptr<AsyncOpInfo<1>> op_info = + std::make_shared<AsyncOpInfo<1>>(std::array<uint64_t, 1>{0}, + std::array<uint64_t, 1>{opSize}); + op_info->bufferlist[0] = db->generate_data(0, opSize); + op_info->wop.write_full(op_info->bufferlist[0]); + auto create_cb = [this](boost::system::error_code ec, version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, create_cb); + librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, + create_cb); + break; } - break; - case OpType::REMOVE: - { + case OpType::Remove: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(); + auto op_info = std::make_shared<AsyncOpInfo<0>>(); op_info->wop.remove(); - auto remove_cb = [this] (boost::system::error_code ec, - version_t ver) { + auto remove_cb = [this](boost::system::error_code ec, version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, remove_cb); + librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, + remove_cb); + break; } - break; + case OpType::Read: + [[fallthrough]]; + case OpType::Read2: + [[fallthrough]]; + case OpType::Read3: + [[fallthrough]]; + case OpType::Write: + [[fallthrough]]; + case OpType::Write2: + [[fallthrough]]; + case OpType::Write3: + [[fallthrough]]; + case OpType::FailedWrite: + [[fallthrough]]; + case OpType::FailedWrite2: + [[fallthrough]]; + case OpType::FailedWrite3: + applyReadWriteOp(op); + break; + case OpType::InjectReadError: + [[fallthrough]]; + case OpType::InjectWriteError: + [[fallthrough]]; + case OpType::ClearReadErrorInject: + [[fallthrough]]; + case OpType::ClearWriteErrorInject: + applyInjectOp(op); + break; + default: + ceph_abort_msg("Unrecognised Op"); + break; + } +} - case OpType::READ: - { - start_io(); - 
op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1); - op_info->rop.read(op.offset1 * block_size, - op.length1 * block_size, - &op_info->bl1, nullptr); - auto read_cb = [this, op_info] (boost::system::error_code ec, - version_t ver, - bufferlist bl) { - ceph_assert(ec == boost::system::errc::success); - ceph_assert(db->validate(op_info->bl1, - op_info->offset1, - op_info->length1)); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->rop, 0, nullptr, read_cb); - num_io++; +void RadosIo::applyReadWriteOp(IoOp& op) { + auto applyReadOp = [this]<OpType opType, int N>( + ReadWriteOp<opType, N> readOp) { + auto op_info = + std::make_shared<AsyncOpInfo<N>>(readOp.offset, readOp.length); + + for (int i = 0; i < N; i++) { + op_info->rop.read(readOp.offset[i] * block_size, + readOp.length[i] * block_size, &op_info->bufferlist[i], + nullptr); } - break; + auto read_cb = [this, op_info](boost::system::error_code ec, version_t ver, + bufferlist bl) { + ceph_assert(ec == boost::system::errc::success); + for (int i = 0; i < N; i++) { + ceph_assert(db->validate(op_info->bufferlist[i], op_info->offset[i], + op_info->length[i])); + } + finish_io(); + }; + librados::async_operate(asio, io, oid, &op_info->rop, 0, nullptr, read_cb); + num_io++; + }; - case OpType::READ2: - { - start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, - op.length1, - op.offset2, - op.length2); - - op_info->rop.read(op.offset1 * block_size, - op.length1 * block_size, - &op_info->bl1, nullptr); - op_info->rop.read(op.offset2 * block_size, - op.length2 * block_size, - &op_info->bl2, nullptr); - auto read2_cb = [this, op_info] (boost::system::error_code ec, - version_t ver, - bufferlist bl) { - ceph_assert(ec == boost::system::errc::success); - ceph_assert(db->validate(op_info->bl1, - op_info->offset1, - op_info->length1)); - ceph_assert(db->validate(op_info->bl2, - op_info->offset2, - op_info->length2)); - finish_io(); - }; - librados::async_operate(asio, io, oid, 
- &op_info->rop, 0, nullptr, read2_cb); - num_io++; + auto applyWriteOp = [this]<OpType opType, int N>( + ReadWriteOp<opType, N> writeOp) { + auto op_info = + std::make_shared<AsyncOpInfo<N>>(writeOp.offset, writeOp.length); + for (int i = 0; i < N; i++) { + op_info->bufferlist[i] = + db->generate_data(writeOp.offset[i], writeOp.length[i]); + op_info->wop.write(writeOp.offset[i] * block_size, + op_info->bufferlist[i]); } - break; + auto write_cb = [this](boost::system::error_code ec, version_t ver) { + ceph_assert(ec == boost::system::errc::success); + finish_io(); + }; + librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, write_cb); + num_io++; + }; - case OpType::READ3: - { - start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1, - op.offset2, op.length2, - op.offset3, op.length3); - op_info->rop.read(op.offset1 * block_size, - op.length1 * block_size, - &op_info->bl1, nullptr); - op_info->rop.read(op.offset2 * block_size, - op.length2 * block_size, - &op_info->bl2, nullptr); - op_info->rop.read(op.offset3 * block_size, - op.length3 * block_size, - &op_info->bl3, nullptr); - auto read3_cb = [this, op_info] (boost::system::error_code ec, - version_t ver, - bufferlist bl) { - ceph_assert(ec == boost::system::errc::success); - ceph_assert(db->validate(op_info->bl1, - op_info->offset1, - op_info->length1)); - ceph_assert(db->validate(op_info->bl2, - op_info->offset2, - op_info->length2)); - ceph_assert(db->validate(op_info->bl3, - op_info->offset3, - op_info->length3)); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->rop, 0, nullptr, read3_cb); - num_io++; + auto applyFailedWriteOp = [this]<OpType opType, int N>( + ReadWriteOp<opType, N> writeOp) { + auto op_info = + std::make_shared<AsyncOpInfo<N>>(writeOp.offset, writeOp.length); + for (int i = 0; i < N; i++) { + op_info->bufferlist[i] = + db->generate_data(writeOp.offset[i], writeOp.length[i]); + op_info->wop.write(writeOp.offset[i] * block_size, + 
op_info->bufferlist[i]); } - break; + auto write_cb = [this, writeOp](boost::system::error_code ec, + version_t ver) { + ceph_assert(ec != boost::system::errc::success); + finish_io(); + }; + librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, write_cb); + num_io++; + }; - case OpType::WRITE: - { + switch (op.getOpType()) { + case OpType::Read: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1); - op_info->bl1 = db->generate_data(op.offset1, op.length1); - - op_info->wop.write(op.offset1 * block_size, op_info->bl1); - auto write_cb = [this] (boost::system::error_code ec, - version_t ver) { - ceph_assert(ec == boost::system::errc::success); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, write_cb); - num_io++; + SingleReadOp& readOp = static_cast<SingleReadOp&>(op); + applyReadOp(readOp); + break; } - break; - - case OpType::WRITE2: - { + case OpType::Read2: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1, - op.offset2, op.length2); - op_info->bl1 = db->generate_data(op.offset1, op.length1); - op_info->bl2 = db->generate_data(op.offset2, op.length2); - op_info->wop.write(op.offset1 * block_size, op_info->bl1); - op_info->wop.write(op.offset2 * block_size, op_info->bl2); - auto write2_cb = [this] (boost::system::error_code ec, - version_t ver) { - ceph_assert(ec == boost::system::errc::success); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, write2_cb); - num_io++; + DoubleReadOp& readOp = static_cast<DoubleReadOp&>(op); + applyReadOp(readOp); + break; + } + case OpType::Read3: { + start_io(); + TripleReadOp& readOp = static_cast<TripleReadOp&>(op); + applyReadOp(readOp); + break; + } + case OpType::Write: { + start_io(); + SingleWriteOp& writeOp = static_cast<SingleWriteOp&>(op); + applyWriteOp(writeOp); + break; + } + case OpType::Write2: { + start_io(); + DoubleWriteOp& writeOp = 
static_cast<DoubleWriteOp&>(op); + applyWriteOp(writeOp); + break; + } + case OpType::Write3: { + start_io(); + TripleWriteOp& writeOp = static_cast<TripleWriteOp&>(op); + applyWriteOp(writeOp); + break; } - break; - case OpType::WRITE3: - { + case OpType::FailedWrite: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1, - op.offset2, op.length2, - op.offset3, op.length3); - op_info->bl1 = db->generate_data(op.offset1, op.length1); - op_info->bl2 = db->generate_data(op.offset2, op.length2); - op_info->bl3 = db->generate_data(op.offset3, op.length3); - op_info->wop.write(op.offset1 * block_size, op_info->bl1); - op_info->wop.write(op.offset2 * block_size, op_info->bl2); - op_info->wop.write(op.offset3 * block_size, op_info->bl3); - auto write3_cb = [this] (boost::system::error_code ec, - version_t ver) { - ceph_assert(ec == boost::system::errc::success); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, write3_cb); - num_io++; + SingleFailedWriteOp& writeOp = static_cast<SingleFailedWriteOp&>(op); + applyFailedWriteOp(writeOp); + break; + } + case OpType::FailedWrite2: { + start_io(); + DoubleFailedWriteOp& writeOp = static_cast<DoubleFailedWriteOp&>(op); + applyFailedWriteOp(writeOp); + break; + } + case OpType::FailedWrite3: { + start_io(); + TripleFailedWriteOp& writeOp = static_cast<TripleFailedWriteOp&>(op); + applyFailedWriteOp(writeOp); + break; } - break; - default: - break; + default: + ceph_abort_msg( + fmt::format("Unsupported Read/Write operation ({})", op.getOpType())); + break; } } + +void RadosIo::applyInjectOp(IoOp& op) { + bufferlist osdmap_inbl, inject_inbl, osdmap_outbl, inject_outbl; + auto formatter = std::make_unique<JSONFormatter>(false); + + int osd = -1; + std::vector<int> shard_order; + + ceph::messaging::osd::OSDMapRequest osdMapRequest{pool, get_oid(), ""}; + int rc = send_mon_command(osdMapRequest, rados, "OSDMapRequest", osdmap_inbl, + &osdmap_outbl, formatter.get()); + 
ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(osdmap_outbl.c_str(), osdmap_outbl.length()); + ceph_assert(success); + + ceph::messaging::osd::OSDMapReply reply; + reply.decode_json(&p); + + osd = reply.acting_primary; + shard_order = reply.acting; + + switch (op.getOpType()) { + case OpType::InjectReadError: { + InjectReadErrorOp& errorOp = static_cast<InjectReadErrorOp&>(op); + + if (errorOp.type == 0) { + ceph::messaging::osd::InjectECErrorRequest<InjectOpType::ReadEIO> + injectErrorRequest{pool, oid, errorOp.shard, + errorOp.type, errorOp.when, errorOp.duration}; + int rc = send_osd_command(osd, injectErrorRequest, rados, + "InjectECErrorRequest", inject_inbl, + &inject_outbl, formatter.get()); + ceph_assert(rc == 0); + } else if (errorOp.type == 1) { + ceph::messaging::osd::InjectECErrorRequest< + InjectOpType::ReadMissingShard> + injectErrorRequest{pool, oid, errorOp.shard, + errorOp.type, errorOp.when, errorOp.duration}; + int rc = send_osd_command(osd, injectErrorRequest, rados, + "InjectECErrorRequest", inject_inbl, + &inject_outbl, formatter.get()); + ceph_assert(rc == 0); + } else { + ceph_abort_msg("Unsupported inject type"); + } + break; + } + case OpType::InjectWriteError: { + InjectWriteErrorOp& errorOp = static_cast<InjectWriteErrorOp&>(op); + + if (errorOp.type == 0) { + ceph::messaging::osd::InjectECErrorRequest< + InjectOpType::WriteFailAndRollback> + injectErrorRequest{pool, oid, errorOp.shard, + errorOp.type, errorOp.when, errorOp.duration}; + int rc = send_osd_command(osd, injectErrorRequest, rados, + "InjectECErrorRequest", inject_inbl, + &inject_outbl, formatter.get()); + ceph_assert(rc == 0); + } else if (errorOp.type == 3) { + ceph::messaging::osd::InjectECErrorRequest<InjectOpType::WriteOSDAbort> + injectErrorRequest{pool, oid, errorOp.shard, + errorOp.type, errorOp.when, errorOp.duration}; + int rc = send_osd_command(osd, injectErrorRequest, rados, + "InjectECErrorRequest", inject_inbl, + &inject_outbl, 
formatter.get()); + ceph_assert(rc == 0); + + // This inject is sent directly to the shard we want to inject the error + // on + osd = shard_order[errorOp.shard]; + } else { + ceph_abort("Unsupported inject type"); + } + + break; + } + case OpType::ClearReadErrorInject: { + ClearReadErrorInjectOp& errorOp = + static_cast<ClearReadErrorInjectOp&>(op); + + if (errorOp.type == 0) { + ceph::messaging::osd::InjectECClearErrorRequest<InjectOpType::ReadEIO> + clearErrorInject{pool, oid, errorOp.shard, errorOp.type}; + int rc = send_osd_command(osd, clearErrorInject, rados, + "InjectECClearErrorRequest", inject_inbl, + &inject_outbl, formatter.get()); + ceph_assert(rc == 0); + } else if (errorOp.type == 1) { + ceph::messaging::osd::InjectECClearErrorRequest< + InjectOpType::ReadMissingShard> + clearErrorInject{pool, oid, errorOp.shard, errorOp.type}; + int rc = send_osd_command(osd, clearErrorInject, rados, + "InjectECClearErrorRequest", inject_inbl, + &inject_outbl, formatter.get()); + ceph_assert(rc == 0); + } else { + ceph_abort("Unsupported inject type"); + } + + break; + } + case OpType::ClearWriteErrorInject: { + ClearReadErrorInjectOp& errorOp = + static_cast<ClearReadErrorInjectOp&>(op); + + if (errorOp.type == 0) { + ceph::messaging::osd::InjectECClearErrorRequest< + InjectOpType::WriteFailAndRollback> + clearErrorInject{pool, oid, errorOp.shard, errorOp.type}; + int rc = send_osd_command(osd, clearErrorInject, rados, + "InjectECClearErrorRequest", inject_inbl, + &inject_outbl, formatter.get()); + ceph_assert(rc == 0); + } else if (errorOp.type == 3) { + ceph::messaging::osd::InjectECClearErrorRequest< + InjectOpType::WriteOSDAbort> + clearErrorInject{pool, oid, errorOp.shard, errorOp.type}; + int rc = send_osd_command(osd, clearErrorInject, rados, + "InjectECClearErrorRequest", inject_inbl, + &inject_outbl, formatter.get()); + ceph_assert(rc == 0); + } else { + ceph_abort("Unsupported inject type"); + } + + break; + } + default: + ceph_abort_msg( + 
fmt::format("Unsupported inject operation ({})", op.getOpType())); + break; + } +}
\ No newline at end of file diff --git a/src/common/io_exerciser/RadosIo.h b/src/common/io_exerciser/RadosIo.h index 179c5bba3ae..a5c66ad4768 100644 --- a/src/common/io_exerciser/RadosIo.h +++ b/src/common/io_exerciser/RadosIo.h @@ -10,71 +10,65 @@ * in the object. Uses DataBuffer to create and validate * data buffers. When there are not barrier I/Os this may * issue multiple async I/Os in parallel. - * + * */ namespace ceph { - namespace io_exerciser { - namespace data_generation { - class DataGenerator; - } - - class RadosIo: public Model { - protected: - librados::Rados& rados; - boost::asio::io_context& asio; - std::unique_ptr<ObjectModel> om; - std::unique_ptr<ceph::io_exerciser::data_generation::DataGenerator> db; - std::string pool; - int threads; - ceph::mutex& lock; - ceph::condition_variable& cond; - librados::IoCtx io; - int outstanding_io; +namespace io_exerciser { +namespace data_generation { +class DataGenerator; +} + +class RadosIo : public Model { + protected: + librados::Rados& rados; + boost::asio::io_context& asio; + std::unique_ptr<ObjectModel> om; + std::unique_ptr<ceph::io_exerciser::data_generation::DataGenerator> db; + std::string pool; + std::optional<std::vector<int>> cached_shard_order; + int threads; + ceph::mutex& lock; + ceph::condition_variable& cond; + librados::IoCtx io; + int outstanding_io; + + void start_io(); + void finish_io(); + void wait_for_io(int count); + + public: + RadosIo(librados::Rados& rados, boost::asio::io_context& asio, + const std::string& pool, const std::string& oid, + const std::optional<std::vector<int>>& cached_shard_order, + uint64_t block_size, int seed, int threads, ceph::mutex& lock, + ceph::condition_variable& cond); - void start_io(); - void finish_io(); - void wait_for_io(int count); - - public: - RadosIo(librados::Rados& rados, - boost::asio::io_context& asio, - const std::string& pool, - const std::string& oid, - uint64_t block_size, - int seed, - int threads, - ceph::mutex& lock, - 
ceph::condition_variable& cond); + ~RadosIo(); - ~RadosIo(); + void allow_ec_overwrites(bool allow); - void allow_ec_overwrites(bool allow); + template <int N> + class AsyncOpInfo { + public: + librados::ObjectReadOperation rop; + librados::ObjectWriteOperation wop; + std::array<ceph::bufferlist, N> bufferlist; + std::array<uint64_t, N> offset; + std::array<uint64_t, N> length; - class AsyncOpInfo { - public: - librados::ObjectReadOperation rop; - librados::ObjectWriteOperation wop; - ceph::buffer::list bl1; - ceph::buffer::list bl2; - ceph::buffer::list bl3; - uint64_t offset1; - uint64_t length1; - uint64_t offset2; - uint64_t length2; - uint64_t offset3; - uint64_t length3; + AsyncOpInfo(const std::array<uint64_t, N>& offset = {}, + const std::array<uint64_t, N>& length = {}); + ~AsyncOpInfo() = default; + }; - AsyncOpInfo(uint64_t offset1 = 0, uint64_t length1 = 0, - uint64_t offset2 = 0, uint64_t length2 = 0, - uint64_t offset3 = 0, uint64_t length3 = 0 ); - ~AsyncOpInfo() = default; - }; + // Must be called with lock held + bool readyForIoOp(IoOp& op); + void applyIoOp(IoOp& op); - // Must be called with lock held - bool readyForIoOp(IoOp& op); - - void applyIoOp(IoOp& op); - }; - } -}
\ No newline at end of file + private: + void applyReadWriteOp(IoOp& op); + void applyInjectOp(IoOp& op); +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/json/BalancerStructures.cc b/src/common/json/BalancerStructures.cc new file mode 100644 index 00000000000..48dfb843761 --- /dev/null +++ b/src/common/json/BalancerStructures.cc @@ -0,0 +1,38 @@ +#include "BalancerStructures.h" + +#include "common/ceph_json.h" + +using namespace ceph::messaging::balancer; + +void BalancerOffRequest::dump(Formatter* f) const { + encode_json("prefix", "balancer off", f); +} + +void BalancerOffRequest::decode_json(JSONObj* obj) {} + +void BalancerStatusRequest::dump(Formatter* f) const { + encode_json("prefix", "balancer status", f); +} + +void BalancerStatusRequest::decode_json(JSONObj* obj) {} + +void BalancerStatusReply::dump(Formatter* f) const { + encode_json("active", active, f); + encode_json("last_optimization_duration", last_optimization_duration, f); + encode_json("last_optimization_started", last_optimization_started, f); + encode_json("mode", mode, f); + encode_json("no_optimization_needed", no_optimization_needed, f); + encode_json("optimize_result", optimize_result, f); +} + +void BalancerStatusReply::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("active", active, obj); + JSONDecoder::decode_json("last_optimization_duration", + last_optimization_duration, obj); + JSONDecoder::decode_json("last_optimization_started", + last_optimization_started, obj); + JSONDecoder::decode_json("mode", mode, obj); + JSONDecoder::decode_json("no_optimization_needed", no_optimization_needed, + obj); + JSONDecoder::decode_json("optimize_result", optimize_result, obj); +}
\ No newline at end of file diff --git a/src/common/json/BalancerStructures.h b/src/common/json/BalancerStructures.h new file mode 100644 index 00000000000..bbf5c748eb3 --- /dev/null +++ b/src/common/json/BalancerStructures.h @@ -0,0 +1,35 @@ +#pragma once + +#include <string> + +#include "include/types.h" + +class JSONObj; + +namespace ceph { +namespace messaging { +namespace balancer { +struct BalancerOffRequest { + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct BalancerStatusRequest { + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct BalancerStatusReply { + bool active; + std::string last_optimization_duration; + std::string last_optimization_started; + std::string mode; + bool no_optimization_needed; + std::string optimize_result; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; +} // namespace balancer +} // namespace messaging +} // namespace ceph
# src/common/json/CMakeLists.txt
# Static library holding the JSON command/reply structures.
add_library(json_structures STATIC
  BalancerStructures.cc
  ConfigStructures.cc
  OSDStructures.cc)

target_link_libraries(json_structures global)
\ No newline at end of file diff --git a/src/common/json/ConfigStructures.cc b/src/common/json/ConfigStructures.cc new file mode 100644 index 00000000000..651278d002a --- /dev/null +++ b/src/common/json/ConfigStructures.cc @@ -0,0 +1,20 @@ +#include "ConfigStructures.h" + +#include "common/ceph_json.h" + +using namespace ceph::messaging::config; + +void ConfigSetRequest::dump(Formatter* f) const { + encode_json("prefix", "config set", f); + encode_json("who", who, f); + encode_json("name", name, f); + encode_json("value", value, f); + encode_json("force", force, f); +} + +void ConfigSetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("who", who, obj); + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("value", value, obj); + JSONDecoder::decode_json("force", force, obj); +}
\ No newline at end of file diff --git a/src/common/json/ConfigStructures.h b/src/common/json/ConfigStructures.h new file mode 100644 index 00000000000..554229d75f4 --- /dev/null +++ b/src/common/json/ConfigStructures.h @@ -0,0 +1,24 @@ +#pragma once + +#include <optional> +#include <string> + +#include "include/types.h" + +class JSONObj; + +namespace ceph { +namespace messaging { +namespace config { +struct ConfigSetRequest { + std::string who; + std::string name; + std::string value; + std::optional<bool> force; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; +} // namespace config +} // namespace messaging +} // namespace ceph
\ No newline at end of file diff --git a/src/common/json/OSDStructures.cc b/src/common/json/OSDStructures.cc new file mode 100644 index 00000000000..aaac5f6e169 --- /dev/null +++ b/src/common/json/OSDStructures.cc @@ -0,0 +1,150 @@ +#include "OSDStructures.h" + +#include "common/ceph_json.h" +#include "common/io_exerciser/OpType.h" + +using namespace ceph::messaging::osd; + +void OSDMapRequest::dump(Formatter* f) const { + encode_json("prefix", "osd map", f); + encode_json("pool", pool, f); + encode_json("object", object, f); + encode_json("nspace", nspace, f); + encode_json("format", format, f); +} + +void OSDMapRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("object", object, obj); + JSONDecoder::decode_json("nspace", nspace, obj); + JSONDecoder::decode_json("format", format, obj); +} + +void OSDMapReply::dump(Formatter* f) const { + encode_json("epoch", epoch, f); + encode_json("pool", pool, f); + encode_json("pool_id", pool_id, f); + encode_json("objname", objname, f); + encode_json("raw_pgid", raw_pgid, f); + encode_json("pgid", pgid, f); + encode_json("up", up, f); + encode_json("up_primary", up_primary, f); + encode_json("acting", acting, f); + encode_json("acting_primary", acting_primary, f); +} + +void OSDMapReply::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("epoch", epoch, obj); + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("pool_id", pool_id, obj); + JSONDecoder::decode_json("objname", objname, obj); + JSONDecoder::decode_json("raw_pgid", raw_pgid, obj); + JSONDecoder::decode_json("pgid", pgid, obj); + JSONDecoder::decode_json("up", up, obj); + JSONDecoder::decode_json("up_primary", up_primary, obj); + JSONDecoder::decode_json("acting", acting, obj); + JSONDecoder::decode_json("acting_primary", acting_primary, obj); +} + +void OSDPoolGetRequest::dump(Formatter* f) const { + encode_json("prefix", "osd pool get", f); + encode_json("pool", pool, f); + 
encode_json("var", var, f); + encode_json("format", format, f); +} + +void OSDPoolGetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("var", var, obj); + JSONDecoder::decode_json("format", format, obj); +} + +void OSDPoolGetReply::dump(Formatter* f) const { + encode_json("erasure_code_profile", erasure_code_profile, f); +} + +void OSDPoolGetReply::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("erasure_code_profile", erasure_code_profile, obj); +} + +void OSDECProfileGetRequest::dump(Formatter* f) const { + encode_json("prefix", "osd pool get", f); + encode_json("name", name, f); + encode_json("format", format, f); +} + +void OSDECProfileGetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("format", format, obj); +} + +void OSDECProfileGetReply::dump(Formatter* f) const { + encode_json("crush-device-class", crush_device_class, f); + encode_json("crush-failure-domain", crush_failure_domain, f); + encode_json("crush-num-failure-domains", crush_num_failure_domains, f); + encode_json("crush-osds-per-failure-domain", crush_osds_per_failure_domain, + f); + encode_json("crush-root", crush_root, f); + encode_json("jerasure-per-chunk-alignment", jerasure_per_chunk_alignment, f); + encode_json("k", k, f); + encode_json("m", m, f); + encode_json("plugin", plugin, f); + encode_json("technique", technique, f); + encode_json("w", w, f); +} + +void OSDECProfileGetReply::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("crush-device-class", crush_device_class, obj); + JSONDecoder::decode_json("crush-failure-domain", crush_failure_domain, obj); + JSONDecoder::decode_json("crush-num-failure-domains", + crush_num_failure_domains, obj); + JSONDecoder::decode_json("crush-osds-per-failure-domain", + crush_osds_per_failure_domain, obj); + JSONDecoder::decode_json("crush-root", crush_root, obj); + JSONDecoder::decode_json("jerasure-per-chunk-alignment", 
+ jerasure_per_chunk_alignment, obj); + JSONDecoder::decode_json("k", k, obj); + JSONDecoder::decode_json("m", m, obj); + JSONDecoder::decode_json("plugin", plugin, obj); + JSONDecoder::decode_json("technique", technique, obj); + JSONDecoder::decode_json("w", w, obj); +} + +void OSDECProfileSetRequest::dump(Formatter* f) const { + encode_json("prefix", "osd erasure-code-profile set", f); + encode_json("name", name, f); + encode_json("profile", profile, f); +} + +void OSDECProfileSetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("profile", profile, obj); +} + +void OSDECPoolCreateRequest::dump(Formatter* f) const { + encode_json("prefix", "osd pool create", f); + encode_json("pool", pool, f); + encode_json("pool_type", pool_type, f); + encode_json("pg_num", pg_num, f); + encode_json("pgp_num", pgp_num, f); + encode_json("erasure_code_profile", erasure_code_profile, f); +} + +void OSDECPoolCreateRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("pool_type", pool_type, obj); + JSONDecoder::decode_json("pg_num", pg_num, obj); + JSONDecoder::decode_json("pgp_num", pgp_num, obj); + JSONDecoder::decode_json("erasure_code_profile", erasure_code_profile, obj); +} + +void OSDSetRequest::dump(Formatter* f) const { + encode_json("prefix", "osd set", f); + encode_json("key", key, f); + encode_json("yes_i_really_mean_it", yes_i_really_mean_it, f); +} + +void OSDSetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("key", key, obj); + JSONDecoder::decode_json("yes_i_really_mean_it", yes_i_really_mean_it, obj); +}
\ No newline at end of file diff --git a/src/common/json/OSDStructures.h b/src/common/json/OSDStructures.h new file mode 100644 index 00000000000..3e4528a099f --- /dev/null +++ b/src/common/json/OSDStructures.h @@ -0,0 +1,189 @@ +#pragma once + +#include <memory> +#include <string> +#include <vector> + +#include "common/ceph_json.h" +#include "common/io_exerciser/OpType.h" +#include "include/types.h" + +class JSONObj; + +namespace ceph { +namespace messaging { +namespace osd { +struct OSDMapRequest { + std::string pool; + std::string object; + std::string nspace; + std::string format = "json"; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDMapReply { + epoch_t epoch; + std::string pool; + uint64_t pool_id; + std::string objname; + std::string raw_pgid; + std::string pgid; + std::vector<int> up; + int up_primary; + std::vector<int> acting; + int acting_primary; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDPoolGetRequest { + std::string pool; + std::string var = "erasure_code_profile"; + std::string format = "json"; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDPoolGetReply { + std::string erasure_code_profile; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDECProfileGetRequest { + std::string name; + std::string format = "json"; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDECProfileGetReply { + std::string crush_device_class; + std::string crush_failure_domain; + int crush_num_failure_domains; + int crush_osds_per_failure_domain; + std::string crush_root; + bool jerasure_per_chunk_alignment; + int k; + int m; + std::string plugin; + std::string technique; + std::string w; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDECProfileSetRequest { + std::string name; + std::vector<std::string> profile; + + void dump(Formatter* f) const; + 
void decode_json(JSONObj* obj); +}; + +struct OSDECPoolCreateRequest { + std::string pool; + std::string pool_type; + int pg_num; + int pgp_num; + std::string erasure_code_profile; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDSetRequest { + std::string key; + std::optional<bool> yes_i_really_mean_it = std::nullopt; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +// These structures are sent directly to the relevant OSD +// rather than the monitor +template <io_exerciser::InjectOpType op_type> +struct InjectECErrorRequest { + std::string pool; + std::string objname; + int shardid; + std::optional<uint64_t> type; + std::optional<uint64_t> when; + std::optional<uint64_t> duration; + + void dump(Formatter* f) const { + switch (op_type) { + case io_exerciser::InjectOpType::ReadEIO: + [[fallthrough]]; + case io_exerciser::InjectOpType::ReadMissingShard: + ::encode_json("prefix", "injectecreaderr", f); + break; + case io_exerciser::InjectOpType::WriteFailAndRollback: + [[fallthrough]]; + case io_exerciser::InjectOpType::WriteOSDAbort: + ::encode_json("prefix", "injectecwriteerr", f); + break; + default: + ceph_abort_msg("Unsupported Inject Type"); + } + ::encode_json("pool", pool, f); + ::encode_json("objname", objname, f); + ::encode_json("shardid", shardid, f); + ::encode_json("type", type, f); + ::encode_json("when", when, f); + ::encode_json("duration", duration, f); + } + void decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("objname", objname, obj); + JSONDecoder::decode_json("shardid", shardid, obj); + JSONDecoder::decode_json("type", type, obj); + JSONDecoder::decode_json("when", when, obj); + JSONDecoder::decode_json("duration", duration, obj); + } +}; + +template <io_exerciser::InjectOpType op_type> +struct InjectECClearErrorRequest { + std::string pool; + std::string objname; + int shardid; + std::optional<uint64_t> type; + + void 
dump(Formatter* f) const { + switch (op_type) { + case io_exerciser::InjectOpType::ReadEIO: + [[fallthrough]]; + case io_exerciser::InjectOpType::ReadMissingShard: + ::encode_json("prefix", "injectecclearreaderr", f); + break; + case io_exerciser::InjectOpType::WriteFailAndRollback: + [[fallthrough]]; + case io_exerciser::InjectOpType::WriteOSDAbort: + ::encode_json("prefix", "injectecclearwriteerr", f); + break; + default: + ceph_abort_msg("Unsupported Inject Type"); + } + ::encode_json("pool", pool, f); + ::encode_json("objname", objname, f); + ::encode_json("shardid", shardid, f); + ::encode_json("type", type, f); + } + void decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("objname", objname, obj); + JSONDecoder::decode_json("shardid", shardid, obj); + JSONDecoder::decode_json("type", type, obj); + } +}; +} // namespace osd +} // namespace messaging +} // namespace ceph
\ No newline at end of file diff --git a/src/common/options/crimson.yaml.in b/src/common/options/crimson.yaml.in index 69b3a615576..132a4a09e89 100644 --- a/src/common/options/crimson.yaml.in +++ b/src/common/options/crimson.yaml.in @@ -2,6 +2,17 @@ --- options: +- name: crimson_osd_objectstore + type: str + level: advanced + desc: backend type for a Crimson OSD (e.g seastore or bluestore) + default: bluestore + enum_values: + - bluestore + - seastore + - cyanstore + flags: + - create - name: crimson_osd_obc_lru_size type: uint level: advanced diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 94824faef6b..03a53cd7cea 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1713,6 +1713,12 @@ options: default: 1000 services: - mds +- name: mds_delay_journal_replay_for_testing + type: millisecs + level: dev + desc: Delay the journal replay to verify the replay time estimate + long_desc: Journal replay warning is activated if the mds has been in replay state for more than 30 seconds. This config delays replay for validating the replay warning in tests. 
+ default: 0 flags: - runtime - name: mds_server_dispatch_killpoint_random @@ -1739,4 +1745,4 @@ options: default: 16 services: - mds - min: 8 + min: 4 diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index ab1634bc154..1307030e3fb 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -91,6 +91,13 @@ options: default: 1000 services: - mon +- name: mon_nvmeofgw_delete_grace + type: secs + level: advanced + desc: Issue NVMEOF_GATEWAY_DELETING health warning after this amount of time has elapsed + default: 15_min + services: + - mon - name: mon_mgr_inactive_grace type: int level: advanced diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc index d125d7171e0..a0629a15686 100644 --- a/src/common/pick_address.cc +++ b/src/common/pick_address.cc @@ -642,17 +642,24 @@ int get_iface_numa_node( bool is_addr_in_subnet( CephContext *cct, const std::string &networks, - const std::string &addr) + const entity_addr_t &addr) { const auto nets = get_str_list(networks); ceph_assert(!nets.empty()); - unsigned ipv = CEPH_PICK_ADDRESS_IPV4; - struct sockaddr_in public_addr; - public_addr.sin_family = AF_INET; - - if(inet_pton(AF_INET, addr.c_str(), &public_addr.sin_addr) != 1) { - lderr(cct) << "unable to convert chosen address to string: " << addr << dendl; + struct sockaddr_in6 public_addr6; + struct sockaddr_in public_addr4; + + if (addr.is_ipv4() && + inet_pton(AF_INET, addr.ip_only_to_str().c_str(), &public_addr4.sin_addr) == 1) { + public_addr4.sin_family = AF_INET; + } else if (addr.is_ipv6() && + inet_pton(AF_INET6, addr.ip_only_to_str().c_str(), &public_addr6.sin6_addr) == 1) { + public_addr6.sin6_family = AF_INET6; + ipv = CEPH_PICK_ADDRESS_IPV6; + } else { + std::string_view addr_type = addr.is_ipv4() ? 
"IPv4" : "IPv6"; + lderr(cct) << "IP address " << addr << " is not parseable as " << addr_type << dendl; return false; } @@ -660,10 +667,16 @@ bool is_addr_in_subnet( struct ifaddrs ifa; memset(&ifa, 0, sizeof(ifa)); ifa.ifa_next = nullptr; - ifa.ifa_addr = (struct sockaddr*)&public_addr; + if (addr.is_ipv4()) { + ifa.ifa_addr = (struct sockaddr*)&public_addr4; + } else if (addr.is_ipv6()) { + ifa.ifa_addr = (struct sockaddr*)&public_addr6; + } + if(matches_with_net(cct, ifa, net, ipv)) { return true; } } + lderr(cct) << "address " << addr << " is not in networks '" << networks << "'" << dendl; return false; } diff --git a/src/common/pick_address.h b/src/common/pick_address.h index 40575d7d155..c28a6037ded 100644 --- a/src/common/pick_address.h +++ b/src/common/pick_address.h @@ -98,6 +98,6 @@ int get_iface_numa_node( bool is_addr_in_subnet( CephContext *cct, const std::string &networks, - const std::string &addr); + const entity_addr_t &addr); #endif diff --git a/src/compressor/lz4/LZ4Compressor.cc b/src/compressor/lz4/LZ4Compressor.cc index a209a5ac149..1504a2fe65d 100644 --- a/src/compressor/lz4/LZ4Compressor.cc +++ b/src/compressor/lz4/LZ4Compressor.cc @@ -121,16 +121,12 @@ int LZ4Compressor::decompress(ceph::buffer::list::const_iterator &p, LZ4_streamDecode_t lz4_stream_decode; LZ4_setStreamDecode(&lz4_stream_decode, nullptr, 0); - ceph::buffer::ptr cur_ptr = p.get_current_ptr(); - ceph::buffer::ptr *ptr = &cur_ptr; - std::optional<ceph::buffer::ptr> data_holder; - if (compressed_len != cur_ptr.length()) { - data_holder.emplace(compressed_len); - p.copy_deep(compressed_len, *data_holder); - ptr = &*data_holder; - } - - char *c_in = ptr->c_str(); + ceph::buffer::list indata; + // this does a shallow copy + p.copy(compressed_len, indata); + // if the input isn't fragmented, c_str() costs almost nothing. 
+ // otherwise rectifying copy will be taken + const char* c_in = indata.c_str(); char *c_out = dstptr.c_str(); for (unsigned i = 0; i < count; ++i) { int r = LZ4_decompress_safe_continue( diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h index 92d99d332c4..0d73658e709 100644 --- a/src/crimson/common/shared_lru.h +++ b/src/crimson/common/shared_lru.h @@ -25,12 +25,17 @@ class SharedLRU { SimpleLRU<K, shared_ptr_t, false> cache; std::map<K, std::pair<weak_ptr_t, V*>> weak_refs; + // Once all of the shared pointers are destroyed, + // erase the tracked object from the weak_ref map + // before actually destroying it struct Deleter { - SharedLRU<K,V>* cache; + SharedLRU<K,V>* shared_lru_ptr; const K key; - void operator()(V* ptr) { - cache->_erase_weak(key); - delete ptr; + void operator()(V* value_ptr) { + if (shared_lru_ptr) { + shared_lru_ptr->_erase_weak(key); + } + delete value_ptr; } }; void _erase_weak(const K& key) { @@ -42,9 +47,19 @@ public: {} ~SharedLRU() { cache.clear(); + // initially, we were assuming that no pointer obtained from SharedLRU // can outlive the lru itself. However, since going with the interruption // concept for handling shutdowns, this is no longer valid. + // Moreover, before clearing weak_refs, invalidate each deleter + // cache pointer as this SharedLRU is being destroyed.
+ for (const auto& [key, value] : weak_refs) { + shared_ptr_t val; + val = value.first.lock(); + auto this_deleter = get_deleter<Deleter>(val); + this_deleter->shared_lru_ptr = nullptr; + } + weak_refs.clear(); } /** diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc index f390823a8a0..db6decd84f9 100644 --- a/src/crimson/os/alienstore/alien_store.cc +++ b/src/crimson/os/alienstore/alien_store.cc @@ -141,7 +141,8 @@ seastar::future<> AlienStore::stop() AlienStore::base_errorator::future<bool> AlienStore::exists( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { return op_gates.simple_dispatch("exists", [=, this] { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] { @@ -212,7 +213,8 @@ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> AlienStore::list_objects(CollectionRef ch, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const + uint64_t limit, + uint32_t op_flags) const { logger().debug("{}", __func__); assert(tp); @@ -348,7 +350,8 @@ AlienStore::readv(CollectionRef ch, AlienStore::get_attr_errorator::future<ceph::bufferlist> AlienStore::get_attr(CollectionRef ch, const ghobject_t& oid, - std::string_view name) const + std::string_view name, + uint32_t op_flags) const { logger().debug("{}", __func__); assert(tp); @@ -376,7 +379,8 @@ AlienStore::get_attr(CollectionRef ch, AlienStore::get_attrs_ertr::future<AlienStore::attrs_t> AlienStore::get_attrs(CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { logger().debug("{}", __func__); assert(tp); @@ -397,7 +401,8 @@ AlienStore::get_attrs(CollectionRef ch, auto AlienStore::omap_get_values(CollectionRef ch, const ghobject_t& oid, - const set<string>& keys) + const set<string>& keys, + uint32_t op_flags) -> read_errorator::future<omap_values_t> { logger().debug("{}", __func__); @@ -421,7 +426,8 @@ auto AlienStore::omap_get_values(CollectionRef 
ch, auto AlienStore::omap_get_values(CollectionRef ch, const ghobject_t &oid, - const std::optional<string> &start) + const std::optional<string> &start, + uint32_t op_flags) -> read_errorator::future<std::tuple<bool, omap_values_t>> { logger().debug("{} with_start", __func__); @@ -429,8 +435,21 @@ auto AlienStore::omap_get_values(CollectionRef ch, return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] { auto c = static_cast<AlienCollection*>(ch.get()); - return store->omap_get_values(c->collection, oid, start, - reinterpret_cast<map<string, bufferlist>*>(&values)); + return store->omap_iterate( + c->collection, oid, + ObjectStore::omap_iter_seek_t{ + .seek_position = start.value_or(std::string{}), + // FIXME: classical OSDs begin iteration from LOWER_BOUND + // (or UPPER_BOUND if filter_prefix > start). However, these + // bits are not implemented yet + .seek_type = ObjectStore::omap_iter_seek_t::UPPER_BOUND + }, + [&values] + (std::string_view key, std::string_view value) mutable { + values[std::string{key}].append(value); + // FIXME: there is no limit on the number of entries yet + return ObjectStore::omap_iter_ret_t::NEXT; + }); }).then([&values] (int r) -> read_errorator::future<std::tuple<bool, omap_values_t>> { if (r == -ENOENT) { @@ -578,7 +597,8 @@ unsigned AlienStore::get_max_attr_name_length() const seastar::future<struct stat> AlienStore::stat( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { assert(tp); return do_with_op_gate((struct stat){}, [this, ch, oid](auto& st) { @@ -604,7 +624,8 @@ seastar::future<std::string> AlienStore::get_default_device_class() } auto AlienStore::omap_get_header(CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) -> get_attr_errorator::future<ceph::bufferlist> { assert(tp); @@ -630,7 +651,8 @@ AlienStore::read_errorator::future<std::map<uint64_t, uint64_t>>
AlienStore::fie CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { assert(tp); return do_with_op_gate(std::map<uint64_t, uint64_t>(), [=, this](auto& destmap) { diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h index 853585dac9c..1d39411450e 100644 --- a/src/crimson/os/alienstore/alien_store.h +++ b/src/crimson/os/alienstore/alien_store.h @@ -36,7 +36,8 @@ public: base_errorator::future<bool> exists( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final; read_errorator::future<ceph::bufferlist> read(CollectionRef c, const ghobject_t& oid, @@ -49,29 +50,36 @@ public: uint32_t op_flags = 0) final; - get_attr_errorator::future<ceph::bufferlist> get_attr(CollectionRef c, - const ghobject_t& oid, - std::string_view name) const final; - get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c, - const ghobject_t& oid) final; + get_attr_errorator::future<ceph::bufferlist> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name, + uint32_t op_flags = 0) const final; + get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) final; + const omap_keys_t& keys, + uint32_t op_flags = 0) final; /// Retrieves paged set of values > start (if present) read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) final; ///< @return <done, values> values.empty() iff done 
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const final; + uint64_t limit, + uint32_t op_flags = 0) const final; seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; seastar::future<CollectionRef> open_collection(const coll_t& cid) final; @@ -97,16 +105,19 @@ public: unsigned get_max_attr_name_length() const final; seastar::future<struct stat> stat( CollectionRef, - const ghobject_t&) final; + const ghobject_t&, + uint32_t op_flags = 0) final; seastar::future<std::string> get_default_device_class() final; get_attr_errorator::future<ceph::bufferlist> omap_get_header( CollectionRef, - const ghobject_t&) final; + const ghobject_t&, + uint32_t) final; read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( CollectionRef, const ghobject_t&, uint64_t off, - uint64_t len) final; + uint64_t len, + uint32_t op_flags) final; FuturizedStore::Shard& get_sharded_store() final { return *this; diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc index a8bf514de15..41819fb5eb6 100644 --- a/src/crimson/os/cyanstore/cyan_store.cc +++ b/src/crimson/os/cyanstore/cyan_store.cc @@ -208,7 +208,8 @@ CyanStore::Shard::list_objects( CollectionRef ch, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const + uint64_t limit, + uint32_t op_flags) const { auto c = static_cast<Collection*>(ch.get()); logger().debug("{} {} {} {} {}", @@ -257,7 +258,8 @@ CyanStore::Shard::list_collections() CyanStore::Shard::base_errorator::future<bool> CyanStore::Shard::exists( CollectionRef ch, - const ghobject_t &oid) + const ghobject_t &oid, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); if (!c->exists) { @@ -333,7 +335,8 @@ CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist> CyanStore::Shard::get_attr( CollectionRef ch, const ghobject_t& oid, - std::string_view name) const + 
std::string_view name, + uint32_t op_flags) const { auto c = static_cast<Collection*>(ch.get()); logger().debug("{} {} {}", @@ -352,7 +355,8 @@ CyanStore::Shard::get_attr( CyanStore::Shard::get_attrs_ertr::future<CyanStore::Shard::attrs_t> CyanStore::Shard::get_attrs( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); logger().debug("{} {} {}", @@ -367,7 +371,8 @@ CyanStore::Shard::get_attrs( auto CyanStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t& oid, - const omap_keys_t& keys) + const omap_keys_t& keys, + uint32_t op_flags) -> read_errorator::future<omap_values_t> { auto c = static_cast<Collection*>(ch.get()); @@ -388,7 +393,8 @@ auto CyanStore::Shard::omap_get_values( auto CyanStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const std::optional<string> &start) + const std::optional<string> &start, + uint32_t op_flags) -> CyanStore::Shard::read_errorator::future<std::tuple<bool, omap_values_t>> { auto c = static_cast<Collection*>(ch.get()); @@ -409,7 +415,8 @@ auto CyanStore::Shard::omap_get_values( auto CyanStore::Shard::omap_get_header( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) -> CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist> { auto c = static_cast<Collection*>(ch.get()); @@ -977,7 +984,8 @@ CyanStore::Shard::fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); @@ -992,7 +1000,8 @@ CyanStore::Shard::fiemap( seastar::future<struct stat> CyanStore::Shard::stat( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); auto o = c->get_object(oid); diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h index e9394991bc2..1d481ef5829 100644 --- 
a/src/crimson/os/cyanstore/cyan_store.h +++ b/src/crimson/os/cyanstore/cyan_store.h @@ -34,11 +34,13 @@ public: seastar::future<struct stat> stat( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; base_errorator::future<bool> exists( CollectionRef ch, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<ceph::bufferlist> read( CollectionRef c, @@ -56,33 +58,39 @@ public: get_attr_errorator::future<ceph::bufferlist> get_attr( CollectionRef c, const ghobject_t& oid, - std::string_view name) const final; + std::string_view name, + uint32_t op_flags = 0) const final; get_attrs_ertr::future<attrs_t> get_attrs( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) final; + const omap_keys_t& keys, + uint32_t op_flags = 0) final; read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) final; get_attr_errorator::future<ceph::bufferlist> omap_get_header( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const final; + uint64_t limit, + uint32_t op_flags = 0) const final; seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; @@ -101,7 +109,8 @@ public: CollectionRef c, const ghobject_t& oid, uint64_t off, - uint64_t len) final; + uint64_t len, + uint32_t op_flags) final; unsigned 
get_max_attr_name_length() const final; diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h index 51ef2331014..e7d4c8546de 100644 --- a/src/crimson/os/futurized_store.h +++ b/src/crimson/os/futurized_store.h @@ -54,7 +54,8 @@ public: virtual base_errorator::future<bool> exists( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; using get_attr_errorator = crimson::errorator< crimson::ct_error::enoent, @@ -62,42 +63,49 @@ public: virtual get_attr_errorator::future<ceph::bufferlist> get_attr( CollectionRef c, const ghobject_t& oid, - std::string_view name) const = 0; + std::string_view name, + uint32_t op_flags = 0) const = 0; using get_attrs_ertr = crimson::errorator< crimson::ct_error::enoent>; using attrs_t = std::map<std::string, ceph::bufferlist, std::less<>>; virtual get_attrs_ertr::future<attrs_t> get_attrs( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; virtual seastar::future<struct stat> stat( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; using omap_values_t = attrs_t; using omap_keys_t = std::set<std::string>; virtual read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) = 0; + const omap_keys_t& keys, + uint32_t op_flags = 0) = 0; using omap_values_paged_t = std::tuple<bool, omap_values_t>; virtual read_errorator::future<omap_values_paged_t> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) = 0; ///< @return <done, values> values.empty() only if done virtual get_attr_errorator::future<bufferlist> omap_get_header( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + 
uint32_t op_flags = 0) = 0; virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const = 0; + uint64_t limit, + uint32_t op_flags = 0) const = 0; virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0; @@ -153,7 +161,8 @@ public: CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) = 0; + uint64_t len, + uint32_t op_flags = 0) = 0; virtual unsigned get_max_attr_name_length() const = 0; }; diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index e5b8960c38c..3da5e65ceec 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -1,5 +1,6 @@ set(crimson_seastore_srcs cached_extent.cc + lba_mapping.cc seastore_types.cc segment_manager.cc segment_manager/ephemeral.cc @@ -19,7 +20,6 @@ set(crimson_seastore_srcs omap_manager.cc omap_manager/btree/btree_omap_manager.cc omap_manager/btree/omap_btree_node_impl.cc - btree/btree_range_pin.cc btree/fixed_kv_node.cc onode.cc onode_manager/staged-fltree/node.cc diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc index cecdb985532..64e6749562e 100644 --- a/src/crimson/os/seastore/async_cleaner.cc +++ b/src/crimson/os/seastore/async_cleaner.cc @@ -609,6 +609,7 @@ JournalTrimmerImpl::trim_alloc() return extent_callback->with_transaction_intr( Transaction::src_t::TRIM_ALLOC, "trim_alloc", + CACHE_HINT_NOCACHE, [this, FNAME](auto &t) { auto target = get_alloc_tail_target(); @@ -653,6 +654,7 @@ JournalTrimmerImpl::trim_dirty() return extent_callback->with_transaction_intr( Transaction::src_t::TRIM_DIRTY, "trim_dirty", + CACHE_HINT_NOCACHE, [this, FNAME](auto &t) { auto target = get_dirty_tail_target(); @@ -1125,6 +1127,7 @@ SegmentCleaner::do_reclaim_space( return extent_callback->with_transaction_intr( src, "clean_reclaim_space", + 
CACHE_HINT_NOCACHE, [this, &backref_extents, &pin_list, &reclaimed](auto &t) { return seastar::do_with( @@ -1142,8 +1145,7 @@ SegmentCleaner::do_reclaim_space( pin->get_key(), pin->get_val(), pin->get_length(), - pin->get_type(), - JOURNAL_SEQ_NULL); + pin->get_type()); } for (auto &cached_backref : cached_backref_entries) { if (cached_backref.laddr == L_ADDR_NULL) { @@ -1241,6 +1243,7 @@ SegmentCleaner::clean_space_ret SegmentCleaner::clean_space() return extent_callback->with_transaction_intr( Transaction::src_t::READ, "retrieve_from_backref_tree", + CACHE_HINT_NOCACHE, [this, &weak_read_ret](auto &t) { return backref_manager.get_mappings( t, @@ -1507,6 +1510,7 @@ bool SegmentCleaner::check_usage() SpaceTrackerIRef tracker(space_tracker->make_empty()); extent_callback->with_transaction_weak( "check_usage", + CACHE_HINT_NOCACHE, [this, &tracker](auto &t) { return backref_manager.scan_mapped_space( t, @@ -1813,6 +1817,7 @@ bool RBMCleaner::check_usage() RBMSpaceTracker tracker(rbms); extent_callback->with_transaction_weak( "check_usage", + CACHE_HINT_NOCACHE, [this, &tracker, &rbms](auto &t) { return backref_manager.scan_mapped_space( t, diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h index 424247c5bdc..1cef771aeb8 100644 --- a/src/crimson/os/seastore/async_cleaner.h +++ b/src/crimson/os/seastore/async_cleaner.h @@ -17,6 +17,7 @@ #include "crimson/os/seastore/randomblock_manager_group.h" #include "crimson/os/seastore/transaction.h" #include "crimson/os/seastore/segment_seq_allocator.h" +#include "crimson/os/seastore/backref_mapping.h" namespace crimson::os::seastore { @@ -299,24 +300,29 @@ public: /// Creates empty transaction /// weak transaction should be type READ virtual TransactionRef create_transaction( - Transaction::src_t, const char *name, bool is_weak=false) = 0; + Transaction::src_t, + const char *name, + cache_hint_t cache_hint = CACHE_HINT_TOUCH, + bool is_weak=false) = 0; /// Creates empty transaction 
with interruptible context template <typename Func> auto with_transaction_intr( Transaction::src_t src, const char* name, + cache_hint_t cache_hint, Func &&f) { return do_with_transaction_intr<Func, false>( - src, name, std::forward<Func>(f)); + src, name, cache_hint, std::forward<Func>(f)); } template <typename Func> auto with_transaction_weak( const char* name, + cache_hint_t cache_hint, Func &&f) { return do_with_transaction_intr<Func, true>( - Transaction::src_t::READ, name, std::forward<Func>(f) + Transaction::src_t::READ, name, cache_hint, std::forward<Func>(f) ).handle_error( crimson::ct_error::eagain::assert_failure{"unexpected eagain"}, crimson::ct_error::pass_further_all{} @@ -385,9 +391,10 @@ private: auto do_with_transaction_intr( Transaction::src_t src, const char* name, + cache_hint_t cache_hint, Func &&f) { return seastar::do_with( - create_transaction(src, name, IsWeak), + create_transaction(src, name, cache_hint, IsWeak), [f=std::forward<Func>(f)](auto &ref_t) mutable { return with_trans_intr( *ref_t, diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h index 38084bb00e6..24897dd55da 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.h +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ -9,44 +9,28 @@ namespace crimson::os::seastore::backref { -constexpr size_t BACKREF_BLOCK_SIZE = 4096; - -class BtreeBackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> { - extent_types_t type; +class BtreeBackrefMapping : public BackrefMapping { public: BtreeBackrefMapping(op_context_t<paddr_t> ctx) - : BtreeNodeMapping(ctx) {} + : BackrefMapping(ctx) {} BtreeBackrefMapping( op_context_t<paddr_t> ctx, CachedExtentRef parent, uint16_t pos, backref_map_val_t &val, backref_node_meta_t &&meta) - : BtreeNodeMapping( + : BackrefMapping( + val.type, ctx, parent, pos, val.laddr, val.len, - std::forward<backref_node_meta_t>(meta)), - type(val.type) - {} - 
extent_types_t get_type() const final { - return type; - } - - bool is_clone() const final { - return false; - } - -protected: - std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>> _duplicate( - op_context_t<paddr_t> ctx) const final { - return std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>>( - new BtreeBackrefMapping(ctx)); - } + std::forward<backref_node_meta_t>(meta)) {} }; +constexpr size_t BACKREF_BLOCK_SIZE = 4096; + using BackrefBtree = FixedKVBtree< paddr_t, backref_map_val_t, BackrefInternalNode, BackrefLeafNode, BtreeBackrefMapping, BACKREF_BLOCK_SIZE, false>; diff --git a/src/crimson/os/seastore/backref_entry.h b/src/crimson/os/seastore/backref_entry.h new file mode 100644 index 00000000000..5f9becc9565 --- /dev/null +++ b/src/crimson/os/seastore/backref_entry.h @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <iostream> + +#if FMT_VERSION >= 90000 +#include <fmt/ostream.h> +#endif + +#include <boost/intrusive/set.hpp> + +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +struct backref_entry_t { + using ref_t = std::unique_ptr<backref_entry_t>; + + backref_entry_t( + const paddr_t& paddr, + const laddr_t& laddr, + extent_len_t len, + extent_types_t type) + : paddr(paddr), + laddr(laddr), + len(len), + type(type) { + assert(len > 0); + } + paddr_t paddr = P_ADDR_NULL; + laddr_t laddr = L_ADDR_NULL; + extent_len_t len = 0; + extent_types_t type = extent_types_t::NONE; + friend bool operator< ( + const backref_entry_t &l, + const backref_entry_t &r) { + return l.paddr < r.paddr; + } + friend bool operator> ( + const backref_entry_t &l, + const backref_entry_t &r) { + return l.paddr > r.paddr; + } + friend bool operator== ( + const backref_entry_t &l, + const backref_entry_t &r) { + return l.paddr == r.paddr; + } + + using set_hook_t = + boost::intrusive::set_member_hook< + boost::intrusive::link_mode< 
+ boost::intrusive::auto_unlink>>; + set_hook_t backref_set_hook; + using backref_set_member_options = boost::intrusive::member_hook< + backref_entry_t, + set_hook_t, + &backref_entry_t::backref_set_hook>; + using multiset_t = boost::intrusive::multiset< + backref_entry_t, + backref_set_member_options, + boost::intrusive::constant_time_size<false>>; + + struct cmp_t { + using is_transparent = paddr_t; + bool operator()( + const backref_entry_t &l, + const backref_entry_t &r) const { + return l.paddr < r.paddr; + } + bool operator()(const paddr_t l, const backref_entry_t &r) const { + return l < r.paddr; + } + bool operator()(const backref_entry_t &l, const paddr_t r) const { + return l.paddr < r; + } + }; + + static ref_t create_alloc( + const paddr_t& paddr, + const laddr_t& laddr, + extent_len_t len, + extent_types_t type) { + assert(is_backref_mapped_type(type)); + assert(laddr != L_ADDR_NULL); + return std::make_unique<backref_entry_t>( + paddr, laddr, len, type); + } + + static ref_t create_retire( + const paddr_t& paddr, + extent_len_t len, + extent_types_t type) { + assert(is_backref_mapped_type(type) || + is_retired_placeholder_type(type)); + return std::make_unique<backref_entry_t>( + paddr, L_ADDR_NULL, len, type); + } + + static ref_t create(const alloc_blk_t& delta) { + return std::make_unique<backref_entry_t>( + delta.paddr, delta.laddr, delta.len, delta.type); + } +}; + +inline std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) { + return out << "backref_entry_t{" + << ent.paddr << "~0x" << std::hex << ent.len << std::dec << ", " + << "laddr: " << ent.laddr << ", " + << "type: " << ent.type + << "}"; +} + +using backref_entry_ref = backref_entry_t::ref_t; +using backref_entry_mset_t = backref_entry_t::multiset_t; +using backref_entry_refs_t = std::vector<backref_entry_ref>; +using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>; +using backref_entry_query_set_t = std::set<backref_entry_t, 
backref_entry_t::cmp_t>; + +} // namespace crimson::os::seastore + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::backref_entry_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/backref_manager.h b/src/crimson/os/seastore/backref_manager.h index 3feedb997b4..8c746b571b2 100644 --- a/src/crimson/os/seastore/backref_manager.h +++ b/src/crimson/os/seastore/backref_manager.h @@ -6,6 +6,7 @@ #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/backref_mapping.h" namespace crimson::os::seastore { diff --git a/src/crimson/os/seastore/backref_mapping.h b/src/crimson/os/seastore/backref_mapping.h new file mode 100644 index 00000000000..d0a6a0ea6ff --- /dev/null +++ b/src/crimson/os/seastore/backref_mapping.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/btree/btree_range_pin.h" + +namespace crimson::os::seastore { + +class BackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> { + extent_types_t type; +public: + BackrefMapping(op_context_t<paddr_t> ctx) + : BtreeNodeMapping(ctx) {} + template <typename... T> + BackrefMapping(extent_types_t type, T&&... 
t) + : BtreeNodeMapping(std::forward<T>(t)...), + type(type) {} + extent_types_t get_type() const { + return type; + } +}; + +using BackrefMappingRef = std::unique_ptr<BackrefMapping>; +using backref_pin_list_t = std::list<BackrefMappingRef>; + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc deleted file mode 100644 index f0d507a24c4..00000000000 --- a/src/crimson/os/seastore/btree/btree_range_pin.cc +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "crimson/os/seastore/btree/btree_range_pin.h" -#include "crimson/os/seastore/btree/fixed_kv_node.h" - -namespace crimson::os::seastore { - -template <typename key_t, typename val_t> -get_child_ret_t<LogicalCachedExtent> -BtreeNodeMapping<key_t, val_t>::get_logical_extent( - Transaction &t) -{ - ceph_assert(is_parent_viewable()); - assert(pos != std::numeric_limits<uint16_t>::max()); - ceph_assert(t.get_trans_id() == ctx.trans.get_trans_id()); - auto &p = (FixedKVNode<key_t>&)*parent; - auto k = this->is_indirect() - ? this->get_intermediate_base() - : get_key(); - auto v = p.template get_child<LogicalCachedExtent>(ctx, pos, k); - if (!v.has_child()) { - this->child_pos = v.get_child_pos(); - } - return v; -} - -template <typename key_t, typename val_t> -bool BtreeNodeMapping<key_t, val_t>::is_stable() const -{ - assert(!this->parent_modified()); - assert(pos != std::numeric_limits<uint16_t>::max()); - auto &p = (FixedKVNode<key_t>&)*parent; - auto k = this->is_indirect() - ? 
this->get_intermediate_base() - : get_key(); - return p.is_child_stable(ctx, pos, k); -} - -template <typename key_t, typename val_t> -bool BtreeNodeMapping<key_t, val_t>::is_data_stable() const -{ - assert(!this->parent_modified()); - assert(pos != std::numeric_limits<uint16_t>::max()); - auto &p = (FixedKVNode<key_t>&)*parent; - auto k = this->is_indirect() - ? this->get_intermediate_base() - : get_key(); - return p.is_child_data_stable(ctx, pos, k); -} - -template class BtreeNodeMapping<laddr_t, paddr_t>; -template class BtreeNodeMapping<paddr_t, laddr_t>; -} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h index 91751801e5d..bfd350a8bed 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.h +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -7,11 +7,12 @@ #include "crimson/common/log.h" -#include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction.h" namespace crimson::os::seastore { +class Cache; template <typename node_key_t> struct op_context_t { @@ -116,8 +117,6 @@ protected: extent_len_t len = 0; fixed_kv_node_meta_t<key_t> range; uint16_t pos = std::numeric_limits<uint16_t>::max(); - - virtual std::unique_ptr<BtreeNodeMapping> _duplicate(op_context_t<key_t>) const = 0; fixed_kv_node_meta_t<key_t> _get_pin_range() const { return range; } @@ -139,11 +138,7 @@ public: len(len), range(meta), pos(pos) - { - if (!parent->is_pending()) { - this->child_pos = {parent, pos}; - } - } + {} CachedExtentRef get_parent() const final { return parent; @@ -162,11 +157,6 @@ public: return len; } - extent_types_t get_type() const override { - ceph_abort("should never happen"); - return extent_types_t::ROOT; - } - val_t get_val() const final { if constexpr (std::is_same_v<val_t, paddr_t>) { return value.get_paddr(); @@ -180,16 +170,6 @@ public: return 
range.begin; } - PhysicalNodeMappingRef<key_t, val_t> duplicate() const final { - auto ret = _duplicate(ctx); - ret->range = range; - ret->value = value; - ret->parent = parent; - ret->len = len; - ret->pos = pos; - return ret; - } - bool has_been_invalidated() const final { return parent->has_been_invalidated(); } @@ -215,9 +195,6 @@ public: return unviewable; } - get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction&) final; - bool is_stable() const final; - bool is_data_stable() const final; bool is_parent_viewable() const final { ceph_assert(parent); if (!parent->is_valid()) { diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index cdd9c542d95..86f816e1648 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -28,15 +28,6 @@ SET_SUBSYS(seastore_cache); namespace crimson::os::seastore { -std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) { - return out << "backref_entry_t{" - << ent.paddr << "~0x" << std::hex << ent.len << std::dec << ", " - << "laddr: " << ent.laddr << ", " - << "type: " << ent.type << ", " - << "seq: " << ent.seq << ", " - << "}"; -} - Cache::Cache( ExtentPlacementManager &epm) : epm(epm), @@ -1348,21 +1339,39 @@ record_t Cache::prepare_record( io_stat_t retire_stat; std::vector<alloc_delta_t> alloc_deltas; alloc_delta_t rel_delta; + backref_entry_refs_t backref_entries; rel_delta.op = alloc_delta_t::op_types_t::CLEAR; for (auto &i: t.retired_set) { auto &extent = i.extent; get_by_ext(efforts.retire_by_ext, extent->get_type()).increment(extent->get_length()); retire_stat.increment(extent->get_length()); - DEBUGT("retired and remove extent -- {}", t, *extent); + DEBUGT("retired and remove extent {}~0x{:x} -- {}", + t, extent->get_paddr(), extent->get_length(), *extent); commit_retire_extent(t, extent); - if (is_backref_mapped_extent_node(extent) || - is_retired_placeholder_type(extent->get_type())) { + + // Note: commit extents and backref 
allocations in the same place + if (is_backref_mapped_type(extent->get_type()) || + is_retired_placeholder_type(extent->get_type())) { + DEBUGT("backref_entry free {}~0x{:x}", + t, + extent->get_paddr(), + extent->get_length()); rel_delta.alloc_blk_ranges.emplace_back( - extent->get_paddr(), - L_ADDR_NULL, - extent->get_length(), - extent->get_type()); + alloc_blk_t::create_retire( + extent->get_paddr(), + extent->get_length(), + extent->get_type())); + backref_entries.emplace_back( + backref_entry_t::create_retire( + extent->get_paddr(), + extent->get_length(), + extent->get_type())); + } else if (is_backref_node(extent->get_type())) { + remove_backref_extent(extent->get_paddr()); + } else { + ERRORT("Got unexpected extent type: {}", t, *extent); + ceph_abort("imposible"); } } alloc_deltas.emplace_back(std::move(rel_delta)); @@ -1399,27 +1408,40 @@ record_t Cache::prepare_record( if (modify_time == NULL_TIME) { modify_time = commit_time; } + laddr_t fresh_laddr; + if (i->is_logical()) { + fresh_laddr = i->cast<LogicalCachedExtent>()->get_laddr(); + } else if (is_lba_node(i->get_type())) { + fresh_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin; + } else { + fresh_laddr = L_ADDR_NULL; + } record.push_back(extent_t{ i->get_type(), - i->is_logical() - ? i->cast<LogicalCachedExtent>()->get_laddr() - : (is_lba_node(i->get_type()) - ? 
i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin - : L_ADDR_NULL), + fresh_laddr, std::move(bl) }, modify_time); - if (i->is_valid() - && is_backref_mapped_extent_node(i)) { + + if (!i->is_valid()) { + continue; + } + if (is_backref_mapped_type(i->get_type())) { + laddr_t alloc_laddr; + if (i->is_logical()) { + alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr(); + } else if (is_lba_node(i->get_type())) { + alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin; + } else { + assert(i->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL); + alloc_laddr = L_ADDR_MIN; + } alloc_delta.alloc_blk_ranges.emplace_back( - i->get_paddr(), - i->is_logical() - ? i->cast<LogicalCachedExtent>()->get_laddr() - : (is_lba_node(i->get_type()) - ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin - : L_ADDR_NULL), - i->get_length(), - i->get_type()); + alloc_blk_t::create_alloc( + i->get_paddr(), + alloc_laddr, + i->get_length(), + i->get_type())); } } @@ -1430,14 +1452,20 @@ record_t Cache::prepare_record( get_by_ext(efforts.fresh_ool_by_ext, i->get_type()).increment(i->get_length()); i->prepare_commit(); - if (is_backref_mapped_extent_node(i)) { + if (is_backref_mapped_type(i->get_type())) { + laddr_t alloc_laddr; + if (i->is_logical()) { + alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr(); + } else { + assert(is_lba_node(i->get_type())); + alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin; + } alloc_delta.alloc_blk_ranges.emplace_back( - i->get_paddr(), - i->is_logical() - ? 
i->cast<LogicalCachedExtent>()->get_laddr() - : i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin, - i->get_length(), - i->get_type()); + alloc_blk_t::create_alloc( + i->get_paddr(), + alloc_laddr, + i->get_length(), + i->get_type())); } } @@ -1455,19 +1483,57 @@ record_t Cache::prepare_record( i->state = CachedExtent::extent_state_t::CLEAN; assert(i->is_logical()); i->clear_modified_region(); - touch_extent(*i, &trans_src); + touch_extent(*i, &trans_src, t.get_cache_hint()); DEBUGT("inplace rewrite ool block is commmitted -- {}", t, *i); } + auto existing_stats = t.get_existing_block_stats(); + DEBUGT("total existing blocks num: {}, exist clean num: {}, " + "exist mutation pending num: {}", + t, + existing_stats.valid_num, + existing_stats.clean_num, + existing_stats.mutated_num); for (auto &i: t.existing_block_list) { - if (i->is_valid()) { - alloc_delta.alloc_blk_ranges.emplace_back( - i->get_paddr(), + assert(is_logical_type(i->get_type())); + if (!i->is_valid()) { + continue; + } + + if (i->is_exist_clean()) { + i->state = CachedExtent::extent_state_t::CLEAN; + } else { + assert(i->is_exist_mutation_pending()); + // i->state must become DIRTY in complete_commit() + } + + // exist mutation pending extents must be in t.mutated_block_list + add_extent(i); + const auto t_src = t.get_src(); + if (i->is_dirty()) { + add_to_dirty(i, &t_src); + } else { + touch_extent(*i, &t_src, t.get_cache_hint()); + } + + alloc_delta.alloc_blk_ranges.emplace_back( + alloc_blk_t::create_alloc( + i->get_paddr(), i->cast<LogicalCachedExtent>()->get_laddr(), i->get_length(), - i->get_type()); - } + i->get_type())); + + // Note: commit extents and backref allocations in the same place + // Note: remapping is split into 2 steps, retire and alloc, they must be + // committed atomically together + backref_entries.emplace_back( + backref_entry_t::create_alloc( + i->get_paddr(), + i->cast<LogicalCachedExtent>()->get_laddr(), + i->get_length(), + i->get_type())); } + 
alloc_deltas.emplace_back(std::move(alloc_delta)); for (auto b : alloc_deltas) { @@ -1521,6 +1587,9 @@ record_t Cache::prepare_record( record.push_back(std::move(delta)); } + apply_backref_mset(backref_entries); + t.set_backref_entries(std::move(backref_entries)); + ceph_assert(t.get_fresh_block_stats().num == t.inline_block_list.size() + t.ool_block_list.size() + @@ -1620,26 +1689,35 @@ record_t Cache::prepare_record( return record; } -void Cache::backref_batch_update( - std::vector<backref_entry_ref> &&list, - const journal_seq_t &seq) +void Cache::apply_backref_byseq( + backref_entry_refs_t&& backref_entries, + const journal_seq_t& seq) { - LOG_PREFIX(Cache::backref_batch_update); - DEBUG("inserting {} entries at {}", list.size(), seq); - ceph_assert(seq != JOURNAL_SEQ_NULL); - - for (auto &ent : list) { - backref_entry_mset.insert(*ent); + LOG_PREFIX(Cache::apply_backref_byseq); + DEBUG("backref_entry apply {} entries at {}", + backref_entries.size(), seq); + assert(seq != JOURNAL_SEQ_NULL); + if (backref_entries.empty()) { + return; } - - auto iter = backref_entryrefs_by_seq.find(seq); - if (iter == backref_entryrefs_by_seq.end()) { - backref_entryrefs_by_seq.emplace(seq, std::move(list)); + if (backref_entryrefs_by_seq.empty()) { + backref_entryrefs_by_seq.insert( + backref_entryrefs_by_seq.end(), + {seq, std::move(backref_entries)}); + return; + } + auto last = backref_entryrefs_by_seq.rbegin(); + assert(last->first <= seq); + if (last->first == seq) { + last->second.insert( + last->second.end(), + std::make_move_iterator(backref_entries.begin()), + std::make_move_iterator(backref_entries.end())); } else { - iter->second.insert( - iter->second.end(), - std::make_move_iterator(list.begin()), - std::make_move_iterator(list.end())); + assert(last->first < seq); + backref_entryrefs_by_seq.insert( + backref_entryrefs_by_seq.end(), + {seq, std::move(backref_entries)}); } } @@ -1652,7 +1730,7 @@ void Cache::complete_commit( SUBTRACET(seastore_t, 
"final_block_start={}, start_seq={}", t, final_block_start, start_seq); - std::vector<backref_entry_ref> backref_list; + backref_entry_refs_t backref_entries; t.for_each_finalized_fresh_block([&](const CachedExtentRef &i) { if (!i->is_valid()) { return; @@ -1681,24 +1759,30 @@ void Cache::complete_commit( add_extent(i); assert(!i->is_dirty()); const auto t_src = t.get_src(); - touch_extent(*i, &t_src); + touch_extent(*i, &t_src, t.get_cache_hint()); epm.commit_space_used(i->get_paddr(), i->get_length()); - if (is_backref_mapped_extent_node(i)) { - DEBUGT("backref_list new {} len 0x{:x}", + + // Note: commit extents and backref allocations in the same place + if (is_backref_mapped_type(i->get_type())) { + DEBUGT("backref_entry alloc {}~0x{:x}", t, i->get_paddr(), i->get_length()); - backref_list.emplace_back( - std::make_unique<backref_entry_t>( + laddr_t alloc_laddr; + if (i->is_logical()) { + alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr(); + } else if (is_lba_node(i->get_type())) { + alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin; + } else { + assert(i->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL); + alloc_laddr = L_ADDR_MIN; + } + backref_entries.emplace_back( + backref_entry_t::create_alloc( i->get_paddr(), - i->is_logical() - ? i->cast<LogicalCachedExtent>()->get_laddr() - : (is_lba_node(i->get_type()) - ? 
i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin - : L_ADDR_NULL), + alloc_laddr, i->get_length(), - i->get_type(), - start_seq)); + i->get_type())); } else if (is_backref_node(i->get_type())) { add_backref_extent( i->get_paddr(), @@ -1735,9 +1819,10 @@ void Cache::complete_commit( epm.mark_space_free(extent->get_paddr(), extent->get_length()); } for (auto &i: t.existing_block_list) { - if (i->is_valid()) { - epm.mark_space_used(i->get_paddr(), i->get_length()); + if (!i->is_valid()) { + continue; } + epm.mark_space_used(i->get_paddr(), i->get_length()); } for (auto &i: t.mutated_block_list) { @@ -1751,64 +1836,10 @@ void Cache::complete_commit( for (auto &i: t.retired_set) { auto &extent = i.extent; extent->dirty_from_or_retired_at = start_seq; - if (is_backref_mapped_extent_node(extent) || - is_retired_placeholder_type(extent->get_type())) { - DEBUGT("backref_list free {} len 0x{:x}", - t, - extent->get_paddr(), - extent->get_length()); - backref_list.emplace_back( - std::make_unique<backref_entry_t>( - extent->get_paddr(), - L_ADDR_NULL, - extent->get_length(), - extent->get_type(), - start_seq)); - } else if (is_backref_node(extent->get_type())) { - remove_backref_extent(extent->get_paddr()); - } else { - ERRORT("{}", t, *extent); - ceph_abort("not possible"); - } } - auto existing_stats = t.get_existing_block_stats(); - DEBUGT("total existing blocks num: {}, exist clean num: {}, " - "exist mutation pending num: {}", - t, - existing_stats.valid_num, - existing_stats.clean_num, - existing_stats.mutated_num); - for (auto &i: t.existing_block_list) { - if (i->is_valid()) { - if (i->is_exist_clean()) { - i->state = CachedExtent::extent_state_t::CLEAN; - } else { - assert(i->state == CachedExtent::extent_state_t::DIRTY); - } - DEBUGT("backref_list new existing {} len 0x{:x}", - t, - i->get_paddr(), - i->get_length()); - backref_list.emplace_back( - std::make_unique<backref_entry_t>( - i->get_paddr(), - i->cast<LogicalCachedExtent>()->get_laddr(), - 
i->get_length(), - i->get_type(), - start_seq)); - add_extent(i); - const auto t_src = t.get_src(); - if (i->is_dirty()) { - add_to_dirty(i, &t_src); - } else { - touch_extent(*i, &t_src); - } - } - } - if (!backref_list.empty()) { - backref_batch_update(std::move(backref_list), start_seq); - } + apply_backref_byseq(t.move_backref_entries(), start_seq); + commit_backref_entries(std::move(backref_entries), start_seq); for (auto &i: t.pre_alloc_list) { if (!i->is_valid()) { @@ -1931,7 +1962,7 @@ Cache::replay_delta( alloc_delta_t alloc_delta; decode(alloc_delta, delta.bl); - std::vector<backref_entry_ref> backref_list; + backref_entry_refs_t backref_entries; for (auto &alloc_blk : alloc_delta.alloc_blk_ranges) { if (alloc_blk.paddr.is_relative()) { assert(alloc_blk.paddr.is_record_relative()); @@ -1939,17 +1970,10 @@ Cache::replay_delta( } DEBUG("replay alloc_blk {}~0x{:x} {}, journal_seq: {}", alloc_blk.paddr, alloc_blk.len, alloc_blk.laddr, journal_seq); - backref_list.emplace_back( - std::make_unique<backref_entry_t>( - alloc_blk.paddr, - alloc_blk.laddr, - alloc_blk.len, - alloc_blk.type, - journal_seq)); - } - if (!backref_list.empty()) { - backref_batch_update(std::move(backref_list), journal_seq); + backref_entries.emplace_back( + backref_entry_t::create(alloc_blk)); } + commit_backref_entries(std::move(backref_entries), journal_seq); return replay_delta_ertr::make_ready_future<std::pair<bool, CachedExtentRef>>( std::make_pair(true, nullptr)); } @@ -2002,7 +2026,7 @@ Cache::replay_delta( [](CachedExtent &) {}, [this](CachedExtent &ext) { // replay is not included by the cache hit metrics - touch_extent(ext, nullptr); + touch_extent(ext, nullptr, CACHE_HINT_TOUCH); }, nullptr) : _get_extent_if_cached( diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 07647f6c7cf..a239b861726 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -3,14 +3,13 @@ #pragma once -#include <iostream> - #include 
"seastar/core/shared_future.hh" #include "include/buffer.h" #include "crimson/common/errorator.h" #include "crimson/common/errorator-loop.h" +#include "crimson/os/seastore/backref_entry.h" #include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/extent_placement_manager.h" #include "crimson/os/seastore/logging.h" @@ -38,86 +37,6 @@ class FixedKVBtree; class BackrefManager; class SegmentProvider; -struct backref_entry_t { - backref_entry_t( - const paddr_t paddr, - const laddr_t laddr, - const extent_len_t len, - const extent_types_t type, - const journal_seq_t seq) - : paddr(paddr), - laddr(laddr), - len(len), - type(type), - seq(seq) - {} - backref_entry_t(alloc_blk_t alloc_blk) - : paddr(alloc_blk.paddr), - laddr(alloc_blk.laddr), - len(alloc_blk.len), - type(alloc_blk.type) - {} - paddr_t paddr = P_ADDR_NULL; - laddr_t laddr = L_ADDR_NULL; - extent_len_t len = 0; - extent_types_t type = - extent_types_t::ROOT; - journal_seq_t seq; - friend bool operator< ( - const backref_entry_t &l, - const backref_entry_t &r) { - return l.paddr < r.paddr; - } - friend bool operator> ( - const backref_entry_t &l, - const backref_entry_t &r) { - return l.paddr > r.paddr; - } - friend bool operator== ( - const backref_entry_t &l, - const backref_entry_t &r) { - return l.paddr == r.paddr; - } - - using set_hook_t = - boost::intrusive::set_member_hook< - boost::intrusive::link_mode< - boost::intrusive::auto_unlink>>; - set_hook_t backref_set_hook; - using backref_set_member_options = boost::intrusive::member_hook< - backref_entry_t, - set_hook_t, - &backref_entry_t::backref_set_hook>; - using multiset_t = boost::intrusive::multiset< - backref_entry_t, - backref_set_member_options, - boost::intrusive::constant_time_size<false>>; - - struct cmp_t { - using is_transparent = paddr_t; - bool operator()( - const backref_entry_t &l, - const backref_entry_t &r) const { - return l.paddr < r.paddr; - } - bool operator()(const paddr_t l, const backref_entry_t &r) const { - 
return l < r.paddr; - } - bool operator()(const backref_entry_t &l, const paddr_t r) const { - return l.paddr < r; - } - }; -}; - -std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent); - -using backref_entry_ref = std::unique_ptr<backref_entry_t>; -using backref_entry_mset_t = backref_entry_t::multiset_t; -using backref_entry_refs_t = std::vector<backref_entry_ref>; -using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>; -using backref_entry_query_set_t = std::set< - backref_entry_t, backref_entry_t::cmp_t>; - /** * Cache * @@ -205,6 +124,7 @@ public: TransactionRef create_transaction( Transaction::src_t src, const char* name, + cache_hint_t cache_hint, bool is_weak) { LOG_PREFIX(Cache::create_transaction); @@ -218,7 +138,8 @@ public: [this](Transaction& t) { return on_transaction_destruct(t); }, - ++next_id + ++next_id, + cache_hint ); SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}", *ret, name, src, is_weak); @@ -365,7 +286,7 @@ public: SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}", t, type, offset, *ret); t.add_to_read_set(ret); - touch_extent(*ret, &t_src); + touch_extent(*ret, &t_src, t.get_cache_hint()); return ret->wait_io().then([ret] { return get_extent_if_cached_iertr::make_ready_future< CachedExtentRef>(ret); @@ -422,7 +343,7 @@ public: t, T::TYPE, offset, length); auto f = [&t, this, t_src](CachedExtent &ext) { t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent<T>( @@ -470,7 +391,7 @@ public: ++stats.access.s.load_absent; t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent<T>( @@ -568,7 +489,7 @@ public: ++access_stats.cache_lru; ++stats.access.s.cache_lru; } - touch_extent(*p_extent, &t_src); + touch_extent(*p_extent, 
&t_src, t.get_cache_hint()); } else { if (p_extent->is_dirty()) { ++access_stats.trans_dirty; @@ -915,7 +836,7 @@ private: t, type, offset, length, laddr); auto f = [&t, this, t_src](CachedExtent &ext) { t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent_by_type( @@ -957,7 +878,7 @@ private: ++stats.access.s.load_absent; t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent_by_type( @@ -984,7 +905,7 @@ private: for (auto it = start_iter; it != end_iter; it++) { - res.emplace(it->paddr, it->laddr, it->len, it->type, it->seq); + res.emplace(it->paddr, it->laddr, it->len, it->type); } return res; } @@ -1553,11 +1474,10 @@ private: /// Update lru for access to ref void touch_extent( CachedExtent &ext, - const Transaction::src_t* p_src) + const Transaction::src_t* p_src, + cache_hint_t hint) { - if (p_src && - is_background_transaction(*p_src) && - is_logical_type(ext.get_type())) { + if (hint == CACHE_HINT_NOCACHE && is_logical_type(ext.get_type())) { return; } if (ext.is_stable_clean() && !ext.is_placeholder()) { @@ -1907,9 +1827,23 @@ private: seastar::metrics::metric_group metrics; void register_metrics(); - void backref_batch_update( - std::vector<backref_entry_ref> &&, - const journal_seq_t &); + void apply_backref_mset( + backref_entry_refs_t& backref_entries) { + for (auto& entry : backref_entries) { + backref_entry_mset.insert(*entry); + } + } + + void apply_backref_byseq( + backref_entry_refs_t&& backref_entries, + const journal_seq_t& seq); + + void commit_backref_entries( + backref_entry_refs_t&& backref_entries, + const journal_seq_t& seq) { + apply_backref_mset(backref_entries); + apply_backref_byseq(std::move(backref_entries), seq); + } /// Add extent to extents handling dirty and refcounting /// 
diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index 085a519cb68..49fede1d9a8 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -7,6 +7,7 @@ #include "crimson/common/log.h" #include "crimson/os/seastore/btree/fixed_kv_node.h" +#include "crimson/os/seastore/lba_mapping.h" namespace { [[maybe_unused]] seastar::logger& logger() { @@ -38,12 +39,6 @@ void intrusive_ptr_release(CachedExtent *ptr) #endif -bool is_backref_mapped_extent_node(const CachedExtentRef &extent) { - return extent->is_logical() - || is_lba_node(extent->get_type()) - || extent->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL; -} - std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state) { switch (state) { @@ -148,6 +143,12 @@ void LogicalCachedExtent::on_replace_prior() { parent->children[off] = this; } +void LogicalCachedExtent::maybe_set_intermediate_laddr(LBAMapping &mapping) { + laddr = mapping.is_indirect() + ? mapping.get_intermediate_base() + : mapping.get_key(); +} + parent_tracker_t::~parent_tracker_t() { // this is parent's tracker, reset it auto &p = (FixedKVNode<laddr_t>&)*parent; @@ -156,32 +157,6 @@ parent_tracker_t::~parent_tracker_t() { } } -std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs) -{ - out << "LBAMapping(" << rhs.get_key() - << "~0x" << std::hex << rhs.get_length() << std::dec - << "->" << rhs.get_val(); - if (rhs.is_indirect()) { - out << ",indirect(" << rhs.get_intermediate_base() - << "~0x" << std::hex << rhs.get_intermediate_length() - << "@0x" << rhs.get_intermediate_offset() << std::dec - << ")"; - } - out << ")"; - return out; -} - -std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs) -{ - bool first = true; - out << '['; - for (const auto &i: rhs) { - out << (first ? 
"" : ",") << *i; - first = false; - } - return out << ']'; -} - bool BufferSpace::is_range_loaded(extent_len_t offset, extent_len_t length) const { assert(length > 0); diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 782afa19d33..9dc60d719eb 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -1097,8 +1097,6 @@ protected: std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t); std::ostream &operator<<(std::ostream &, const CachedExtent&); -bool is_backref_mapped_extent_node(const CachedExtentRef &extent); - /// Compare extents by paddr struct paddr_cmp { bool operator()(paddr_t lhs, const CachedExtent &rhs) const { @@ -1281,7 +1279,6 @@ private: }; class ChildableCachedExtent; -class LogicalCachedExtent; class child_pos_t { public: @@ -1339,48 +1336,18 @@ using PhysicalNodeMappingRef = std::unique_ptr<PhysicalNodeMapping<key_t, val_t> template <typename key_t, typename val_t> class PhysicalNodeMapping { public: + PhysicalNodeMapping() = default; + PhysicalNodeMapping(const PhysicalNodeMapping&) = delete; virtual extent_len_t get_length() const = 0; - virtual extent_types_t get_type() const = 0; virtual val_t get_val() const = 0; virtual key_t get_key() const = 0; - virtual PhysicalNodeMappingRef<key_t, val_t> duplicate() const = 0; - virtual PhysicalNodeMappingRef<key_t, val_t> refresh_with_pending_parent() { - ceph_abort("impossible"); - return {}; - } virtual bool has_been_invalidated() const = 0; virtual CachedExtentRef get_parent() const = 0; virtual uint16_t get_pos() const = 0; - // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h - virtual bool is_indirect() const { return false; } - virtual key_t get_intermediate_key() const { return min_max_t<key_t>::null; } - virtual key_t get_intermediate_base() const { return min_max_t<key_t>::null; } - virtual extent_len_t get_intermediate_length() const { return 0; } 
virtual uint32_t get_checksum() const { ceph_abort("impossible"); return 0; } - // The start offset of the pin, must be 0 if the pin is not indirect - virtual extent_len_t get_intermediate_offset() const { - return std::numeric_limits<extent_len_t>::max(); - } - - virtual get_child_ret_t<LogicalCachedExtent> - get_logical_extent(Transaction &t) = 0; - - void link_child(ChildableCachedExtent *c) { - ceph_assert(child_pos); - child_pos->link_child(c); - } - - // For reserved mappings, the return values are - // undefined although it won't crash - virtual bool is_stable() const = 0; - virtual bool is_data_stable() const = 0; - virtual bool is_clone() const = 0; - bool is_zero_reserved() const { - return !get_val().is_real(); - } virtual bool is_parent_viewable() const = 0; virtual bool is_parent_valid() const = 0; virtual bool parent_modified() const { @@ -1393,24 +1360,8 @@ public: } virtual ~PhysicalNodeMapping() {} -protected: - std::optional<child_pos_t> child_pos = std::nullopt; }; -using LBAMapping = PhysicalNodeMapping<laddr_t, paddr_t>; -using LBAMappingRef = PhysicalNodeMappingRef<laddr_t, paddr_t>; - -std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs); - -using lba_pin_list_t = std::list<LBAMappingRef>; - -std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs); - -using BackrefMapping = PhysicalNodeMapping<paddr_t, laddr_t>; -using BackrefMappingRef = PhysicalNodeMappingRef<paddr_t, laddr_t>; - -using backref_pin_list_t = std::list<BackrefMappingRef>; - /** * RetiredExtentPlaceholder * @@ -1524,6 +1475,8 @@ private: return out; } }; + +class LBAMapping; /** * LogicalCachedExtent * @@ -1558,11 +1511,7 @@ public: laddr = nladdr; } - void maybe_set_intermediate_laddr(LBAMapping &mapping) { - laddr = mapping.is_indirect() - ? 
mapping.get_intermediate_base() - : mapping.get_key(); - } + void maybe_set_intermediate_laddr(LBAMapping &mapping); void apply_delta_and_adjust_crc( paddr_t base, const ceph::bufferlist &bl) final { @@ -1662,8 +1611,6 @@ using lextent_list_t = addr_extent_list_base_t< } #if FMT_VERSION >= 90000 -template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {}; template <> struct fmt::formatter<crimson::os::seastore::CachedExtent> : fmt::ostream_formatter {}; template <> struct fmt::formatter<crimson::os::seastore::LogicalCachedExtent> : fmt::ostream_formatter {}; -template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {}; #endif diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h index a5c9029c43c..298935bd22e 100644 --- a/src/crimson/os/seastore/journal.h +++ b/src/crimson/os/seastore/journal.h @@ -59,13 +59,13 @@ public: crimson::ct_error::erange, crimson::ct_error::input_output_error >; - using submit_record_ret = submit_record_ertr::future< - record_locator_t - >; - virtual submit_record_ret submit_record( + using on_submission_func_t = std::function< + void(record_locator_t)>; + virtual submit_record_ertr::future<> submit_record( record_t &&record, - OrderingHandle &handle - ) = 0; + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission) = 0; /** * flush @@ -101,9 +101,6 @@ public: virtual replay_ret replay( delta_handler_t &&delta_handler) = 0; - virtual seastar::future<> finish_commit( - transaction_type_t type) = 0; - virtual ~Journal() {} virtual backend_type_t get_type() = 0; diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.cc b/src/crimson/os/seastore/journal/circular_bounded_journal.cc index 9ee8b1b997f..41ff8318aba 100644 --- a/src/crimson/os/seastore/journal/circular_bounded_journal.cc +++ b/src/crimson/os/seastore/journal/circular_bounded_journal.cc @@ -58,35 +58,52 @@ 
CircularBoundedJournal::close_ertr::future<> CircularBoundedJournal::close() return record_submitter.close(); } -CircularBoundedJournal::submit_record_ret +CircularBoundedJournal::submit_record_ertr::future<> CircularBoundedJournal::submit_record( record_t &&record, - OrderingHandle &handle) + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission) { LOG_PREFIX(CircularBoundedJournal::submit_record); DEBUG("H{} {} start ...", (void*)&handle, record); assert(write_pipeline); - return do_submit_record(std::move(record), handle); + return do_submit_record( + std::move(record), handle, std::move(on_submission) + ).safe_then([this, t_src] { + if (is_trim_transaction(t_src)) { + return update_journal_tail( + trimmer.get_dirty_tail(), + trimmer.get_alloc_tail()); + } else { + return seastar::now(); + } + }); } -CircularBoundedJournal::submit_record_ret +CircularBoundedJournal::submit_record_ertr::future<> CircularBoundedJournal::do_submit_record( record_t &&record, - OrderingHandle &handle) + OrderingHandle &handle, + on_submission_func_t &&on_submission) { LOG_PREFIX(CircularBoundedJournal::do_submit_record); if (!record_submitter.is_available()) { DEBUG("H{} wait ...", (void*)&handle); return record_submitter.wait_available( - ).safe_then([this, record=std::move(record), &handle]() mutable { - return do_submit_record(std::move(record), handle); + ).safe_then([this, record=std::move(record), &handle, + on_submission=std::move(on_submission)]() mutable { + return do_submit_record( + std::move(record), handle, std::move(on_submission)); }); } auto action = record_submitter.check_action(record.size); if (action == RecordSubmitter::action_t::ROLL) { return record_submitter.roll_segment( - ).safe_then([this, record=std::move(record), &handle]() mutable { - return do_submit_record(std::move(record), handle); + ).safe_then([this, record=std::move(record), &handle, + on_submission=std::move(on_submission)]() mutable { + return 
do_submit_record( + std::move(record), handle, std::move(on_submission)); }); } @@ -99,13 +116,16 @@ CircularBoundedJournal::do_submit_record( return handle.enter(write_pipeline->device_submission ).then([submit_fut=std::move(submit_ret.future)]() mutable { return std::move(submit_fut); - }).safe_then([FNAME, this, &handle](record_locator_t result) { + }).safe_then([FNAME, this, &handle, on_submission=std::move(on_submission) + ](record_locator_t result) mutable { return handle.enter(write_pipeline->finalize - ).then([FNAME, this, result, &handle] { + ).then([FNAME, this, result, &handle, + on_submission=std::move(on_submission)] { DEBUG("H{} finish with {}", (void*)&handle, result); auto new_committed_to = result.write_result.get_end_seq(); record_submitter.update_committed_to(new_committed_to); - return result; + std::invoke(on_submission, result); + return seastar::now(); }); }); } @@ -392,13 +412,4 @@ Journal::replay_ret CircularBoundedJournal::replay( }); } -seastar::future<> CircularBoundedJournal::finish_commit(transaction_type_t type) { - if (is_trim_transaction(type)) { - return update_journal_tail( - trimmer.get_dirty_tail(), - trimmer.get_alloc_tail()); - } - return seastar::now(); -} - } diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.h b/src/crimson/os/seastore/journal/circular_bounded_journal.h index 874bd8dc086..16278df6cfe 100644 --- a/src/crimson/os/seastore/journal/circular_bounded_journal.h +++ b/src/crimson/os/seastore/journal/circular_bounded_journal.h @@ -80,9 +80,11 @@ public: return backend_type_t::RANDOM_BLOCK; } - submit_record_ret submit_record( + submit_record_ertr::future<> submit_record( record_t &&record, - OrderingHandle &handle + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission ) final; seastar::future<> flush( @@ -148,8 +150,6 @@ public: return cjs.get_records_start(); } - seastar::future<> finish_commit(transaction_type_t type) final; - using cbj_delta_handler_t = 
std::function< replay_ertr::future<bool>( const record_locator_t&, @@ -160,7 +160,10 @@ public: cbj_delta_handler_t &&delta_handler, journal_seq_t tail); - submit_record_ret do_submit_record(record_t &&record, OrderingHandle &handle); + submit_record_ertr::future<> do_submit_record( + record_t &&record, + OrderingHandle &handle, + on_submission_func_t &&on_submission); void try_read_rolled_header(scan_valid_records_cursor &cursor) { paddr_t addr = convert_abs_addr_to_paddr( diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc index 6be2ad4936a..67c0b3fb8ac 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.cc +++ b/src/crimson/os/seastore/journal/segmented_journal.cc @@ -368,25 +368,30 @@ seastar::future<> SegmentedJournal::flush(OrderingHandle &handle) }); } -SegmentedJournal::submit_record_ret +SegmentedJournal::submit_record_ertr::future<> SegmentedJournal::do_submit_record( record_t &&record, - OrderingHandle &handle) + OrderingHandle &handle, + on_submission_func_t &&on_submission) { LOG_PREFIX(SegmentedJournal::do_submit_record); if (!record_submitter.is_available()) { DEBUG("H{} wait ...", (void*)&handle); return record_submitter.wait_available( - ).safe_then([this, record=std::move(record), &handle]() mutable { - return do_submit_record(std::move(record), handle); + ).safe_then([this, record=std::move(record), &handle, + on_submission=std::move(on_submission)]() mutable { + return do_submit_record( + std::move(record), handle, std::move(on_submission)); }); } auto action = record_submitter.check_action(record.size); if (action == RecordSubmitter::action_t::ROLL) { DEBUG("H{} roll, unavailable ...", (void*)&handle); return record_submitter.roll_segment( - ).safe_then([this, record=std::move(record), &handle]() mutable { - return do_submit_record(std::move(record), handle); + ).safe_then([this, record=std::move(record), &handle, + on_submission=std::move(on_submission)]() mutable 
{ + return do_submit_record( + std::move(record), handle, std::move(on_submission)); }); } else { // SUBMIT_FULL/NOT_FULL DEBUG("H{} submit {} ...", @@ -398,22 +403,27 @@ SegmentedJournal::do_submit_record( return handle.enter(write_pipeline->device_submission ).then([submit_fut=std::move(submit_ret.future)]() mutable { return std::move(submit_fut); - }).safe_then([FNAME, this, &handle](record_locator_t result) { + }).safe_then([FNAME, this, &handle, on_submission=std::move(on_submission) + ](record_locator_t result) mutable { return handle.enter(write_pipeline->finalize - ).then([FNAME, this, result, &handle] { + ).then([FNAME, this, result, &handle, + on_submission=std::move(on_submission)] { DEBUG("H{} finish with {}", (void*)&handle, result); auto new_committed_to = result.write_result.get_end_seq(); record_submitter.update_committed_to(new_committed_to); - return result; + std::invoke(on_submission, result); + return seastar::now(); }); }); } } -SegmentedJournal::submit_record_ret +SegmentedJournal::submit_record_ertr::future<> SegmentedJournal::submit_record( record_t &&record, - OrderingHandle &handle) + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission) { LOG_PREFIX(SegmentedJournal::submit_record); DEBUG("H{} {} start ...", (void*)&handle, record); @@ -429,7 +439,8 @@ SegmentedJournal::submit_record( return crimson::ct_error::erange::make(); } - return do_submit_record(std::move(record), handle); + return do_submit_record( + std::move(record), handle, std::move(on_submission)); } } diff --git a/src/crimson/os/seastore/journal/segmented_journal.h b/src/crimson/os/seastore/journal/segmented_journal.h index 891de7ec306..3f51de70fb3 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.h +++ b/src/crimson/os/seastore/journal/segmented_journal.h @@ -44,9 +44,11 @@ public: close_ertr::future<> close() final; - submit_record_ret submit_record( + submit_record_ertr::future<> submit_record( record_t &&record, - 
OrderingHandle &handle) final; + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission) final; seastar::future<> flush(OrderingHandle &handle) final; @@ -59,9 +61,6 @@ public: backend_type_t get_type() final { return backend_type_t::SEGMENTED; } - seastar::future<> finish_commit(transaction_type_t type) { - return seastar::now(); - } bool is_checksum_needed() final { // segmented journal always requires checksum @@ -69,10 +68,10 @@ public: } private: - submit_record_ret do_submit_record( + submit_record_ertr::future<> do_submit_record( record_t &&record, - OrderingHandle &handle - ); + OrderingHandle &handle, + on_submission_func_t &&on_submission); SegmentSeqAllocatorRef segment_seq_allocator; SegmentAllocator journal_segment_allocator; diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index a050b2cdf47..9a34bf56157 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -19,6 +19,7 @@ #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/lba_mapping.h" namespace crimson::os::seastore { diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 007737ff450..888d3c359ac 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -94,6 +94,45 @@ void unlink_phy_tree_root_node<laddr_t>(RootBlockRef &root_block) { namespace crimson::os::seastore::lba_manager::btree { +get_child_ret_t<LogicalCachedExtent> +BtreeLBAMapping::get_logical_extent(Transaction &t) +{ + ceph_assert(is_parent_viewable()); + assert(pos != std::numeric_limits<uint16_t>::max()); + ceph_assert(t.get_trans_id() == ctx.trans.get_trans_id()); + auto &p = static_cast<LBALeafNode&>(*parent); + auto k = this->is_indirect() + ? 
this->get_intermediate_base() + : get_key(); + auto v = p.template get_child<LogicalCachedExtent>(ctx, pos, k); + if (!v.has_child()) { + this->child_pos = v.get_child_pos(); + } + return v; +} + +bool BtreeLBAMapping::is_stable() const +{ + assert(!this->parent_modified()); + assert(pos != std::numeric_limits<uint16_t>::max()); + auto &p = static_cast<LBALeafNode&>(*parent); + auto k = this->is_indirect() + ? this->get_intermediate_base() + : get_key(); + return p.is_child_stable(ctx, pos, k); +} + +bool BtreeLBAMapping::is_data_stable() const +{ + assert(!this->parent_modified()); + assert(pos != std::numeric_limits<uint16_t>::max()); + auto &p = static_cast<LBALeafNode&>(*parent); + auto k = this->is_indirect() + ? this->get_intermediate_base() + : get_key(); + return p.is_child_data_stable(ctx, pos, k); +} + BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( Transaction &t) diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index ef10ff9623b..e0902053d0e 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -23,11 +23,15 @@ #include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" #include "crimson/os/seastore/btree/btree_range_pin.h" +namespace crimson::os::seastore { +class LogicalCachedExtent; +} + namespace crimson::os::seastore::lba_manager::btree { struct LBALeafNode; -class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> { +class BtreeLBAMapping : public LBAMapping { // To support cloning, there are two kinds of lba mappings: // 1. physical lba mapping: the pladdr in the value of which is the paddr of // the corresponding extent; @@ -61,14 +65,14 @@ class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> { // their keys. 
public: BtreeLBAMapping(op_context_t<laddr_t> ctx) - : BtreeNodeMapping(ctx) {} + : LBAMapping(ctx) {} BtreeLBAMapping( op_context_t<laddr_t> c, LBALeafNodeRef parent, uint16_t pos, lba_map_val_t &val, lba_node_meta_t meta) - : BtreeNodeMapping( + : LBAMapping( c, parent, pos, @@ -190,8 +194,12 @@ public: SUBDEBUGT(seastore_lba, "new pin {}", ctx.trans, static_cast<LBAMapping&>(*new_pin)); return new_pin; } + bool is_stable() const final; + bool is_data_stable() const final; + get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction &t); + protected: - std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate( + LBAMappingRef _duplicate( op_context_t<laddr_t> ctx) const final { auto pin = std::unique_ptr<BtreeLBAMapping>(new BtreeLBAMapping(ctx)); pin->key = key; diff --git a/src/crimson/os/seastore/lba_mapping.cc b/src/crimson/os/seastore/lba_mapping.cc new file mode 100644 index 00000000000..90fae09ce21 --- /dev/null +++ b/src/crimson/os/seastore/lba_mapping.cc @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "lba_mapping.h" + +namespace crimson::os::seastore { + +std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs) +{ + out << "LBAMapping(" << rhs.get_key() + << "~0x" << std::hex << rhs.get_length() << std::dec + << "->" << rhs.get_val(); + if (rhs.is_indirect()) { + out << ",indirect(" << rhs.get_intermediate_base() + << "~0x" << std::hex << rhs.get_intermediate_length() + << "@0x" << rhs.get_intermediate_offset() << std::dec + << ")"; + } + out << ")"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs) +{ + bool first = true; + out << '['; + for (const auto &i: rhs) { + out << (first ? 
"" : ",") << *i; + first = false; + } + return out << ']'; +} + +LBAMappingRef LBAMapping::duplicate() const { + auto ret = _duplicate(ctx); + ret->range = range; + ret->value = value; + ret->parent = parent; + ret->len = len; + ret->pos = pos; + return ret; +} + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/lba_mapping.h b/src/crimson/os/seastore/lba_mapping.h new file mode 100644 index 00000000000..338d4d53f55 --- /dev/null +++ b/src/crimson/os/seastore/lba_mapping.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/btree/btree_range_pin.h" + +namespace crimson::os::seastore { + +class LBAMapping; +using LBAMappingRef = std::unique_ptr<LBAMapping>; + +class LogicalCachedExtent; + +class LBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> { +public: + LBAMapping(op_context_t<laddr_t> ctx) + : BtreeNodeMapping<laddr_t, paddr_t>(ctx) {} + template <typename... T> + LBAMapping(T&&... t) + : BtreeNodeMapping<laddr_t, paddr_t>(std::forward<T>(t)...) 
+ { + if (!parent->is_pending()) { + this->child_pos = {parent, pos}; + } + } + + // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h + virtual bool is_indirect() const = 0; + virtual laddr_t get_intermediate_key() const = 0; + virtual laddr_t get_intermediate_base() const = 0; + virtual extent_len_t get_intermediate_length() const = 0; + // The start offset of the pin, must be 0 if the pin is not indirect + virtual extent_len_t get_intermediate_offset() const = 0; + + virtual get_child_ret_t<LogicalCachedExtent> + get_logical_extent(Transaction &t) = 0; + + void link_child(ChildableCachedExtent *c) { + ceph_assert(child_pos); + child_pos->link_child(c); + } + virtual LBAMappingRef refresh_with_pending_parent() = 0; + + // For reserved mappings, the return values are + // undefined although it won't crash + virtual bool is_stable() const = 0; + virtual bool is_data_stable() const = 0; + virtual bool is_clone() const = 0; + bool is_zero_reserved() const { + return !get_val().is_real(); + } + + LBAMappingRef duplicate() const; + + virtual ~LBAMapping() {} +protected: + virtual LBAMappingRef _duplicate(op_context_t<laddr_t>) const = 0; + std::optional<child_pos_t> child_pos = std::nullopt; +}; + +std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs); +using lba_pin_list_t = std::list<LBAMappingRef>; + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs); + +} // namespace crimson::os::seastore + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 5b51083f344..6a866cb1f9b 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -408,6 +408,7 @@ SeaStore::Shard::mkfs_managers() return 
transaction_manager->with_transaction_intr( Transaction::src_t::MUTATE, "mkfs_seastore", + CACHE_HINT_TOUCH, [this](auto& t) { LOG_PREFIX(SeaStoreS::mkfs_managers); @@ -897,9 +898,10 @@ get_ranges(CollectionRef ch, seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> SeaStore::Shard::list_objects(CollectionRef ch, - const ghobject_t& start, - const ghobject_t& end, - uint64_t limit) const + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit, + uint32_t op_flags) const { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -910,13 +912,14 @@ SeaStore::Shard::list_objects(CollectionRef ch, return seastar::do_with( RetType(std::vector<ghobject_t>(), start), std::move(limit), - [this, ch, start, end](auto& ret, auto& limit) { - return repeat_eagain([this, ch, start, end, &limit, &ret] { + [this, ch, start, end, op_flags](auto& ret, auto& limit) { + return repeat_eagain([this, ch, start, end, &limit, &ret, op_flags] { ++(shard_stats.repeat_read_num); return transaction_manager->with_transaction_intr( Transaction::src_t::READ, "list_objects", + op_flags, [this, ch, start, end, &limit, &ret](auto &t) { LOG_PREFIX(SeaStoreS::list_objects); @@ -1054,6 +1057,7 @@ SeaStore::Shard::list_collections() return transaction_manager->with_transaction_intr( Transaction::src_t::READ, "list_collections", + CACHE_HINT_TOUCH, [this, &ret](auto& t) { LOG_PREFIX(SeaStoreS::list_collections); @@ -1137,6 +1141,7 @@ SeaStore::Shard::read( Transaction::src_t::READ, "read", op_type_t::READ, + op_flags, [this, offset, len, op_flags](auto &t, auto &onode) { return _read(t, onode, offset, len, op_flags); }).finally([this] { @@ -1148,7 +1153,8 @@ SeaStore::Shard::read( SeaStore::Shard::base_errorator::future<bool> SeaStore::Shard::exists( CollectionRef c, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { LOG_PREFIX(SeaStoreS::exists); ++(shard_stats.read_num); @@ -1160,6 +1166,7 @@ SeaStore::Shard::exists( Transaction::src_t::READ, 
"exists", op_type_t::READ, + op_flags, [FNAME](auto& t, auto&) { DEBUGT("exists", t); return seastar::make_ready_future<bool>(true); @@ -1240,7 +1247,8 @@ SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist> SeaStore::Shard::get_attr( CollectionRef ch, const ghobject_t& oid, - std::string_view name) const + std::string_view name, + uint32_t op_flags) const { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1251,6 +1259,7 @@ SeaStore::Shard::get_attr( Transaction::src_t::READ, "get_attr", op_type_t::GET_ATTR, + op_flags, [this, name](auto &t, auto& onode) { return _get_attr(t, onode, name); }).handle_error( @@ -1296,7 +1305,8 @@ SeaStore::Shard::_get_attrs( SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t> SeaStore::Shard::get_attrs( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1307,6 +1317,7 @@ SeaStore::Shard::get_attrs( Transaction::src_t::READ, "get_attrs", op_type_t::GET_ATTRS, + op_flags, [this](auto &t, auto& onode) { return _get_attrs(t, onode); }).handle_error( @@ -1338,7 +1349,8 @@ seastar::future<struct stat> SeaStore::Shard::_stat( seastar::future<struct stat> SeaStore::Shard::stat( CollectionRef c, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1349,6 +1361,7 @@ seastar::future<struct stat> SeaStore::Shard::stat( Transaction::src_t::READ, "stat", op_type_t::STAT, + op_flags, [this, oid](auto &t, auto &onode) { return _stat(t, onode, oid); }).handle_error( @@ -1364,9 +1377,10 @@ seastar::future<struct stat> SeaStore::Shard::stat( SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist> SeaStore::Shard::omap_get_header( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { - return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY); + return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY, 
op_flags); } SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t> @@ -1389,7 +1403,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t> SeaStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const omap_keys_t &keys) + const omap_keys_t &keys, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1400,6 +1415,7 @@ SeaStore::Shard::omap_get_values( Transaction::src_t::READ, "omap_get_values", op_type_t::OMAP_GET_VALUES, + op_flags, [this, keys](auto &t, auto &onode) { return do_omap_get_values(t, onode, keys); }).finally([this] { @@ -1529,7 +1545,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_paged_t> SeaStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const std::optional<std::string> &start) + const std::optional<std::string> &start, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1540,6 +1557,7 @@ SeaStore::Shard::omap_get_values( Transaction::src_t::READ, "omap_get_values2", op_type_t::OMAP_GET_VALUES2, + op_flags, [this, start](auto &t, auto &onode) { return do_omap_get_values(t, onode, start); }).finally([this] { @@ -1589,7 +1607,8 @@ SeaStore::Shard::fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1600,6 +1619,7 @@ SeaStore::Shard::fiemap( Transaction::src_t::READ, "fiemap", op_type_t::READ, + op_flags, [this, off, len](auto &t, auto &onode) { return _fiemap(t, onode, off, len); }).finally([this] { @@ -2677,6 +2697,7 @@ seastar::future<> SeaStore::Shard::write_meta( return transaction_manager->with_transaction_intr( Transaction::src_t::MUTATE, "write_meta", + CACHE_HINT_NOCACHE, [this, &key, &value](auto& t) { LOG_PREFIX(SeaStoreS::write_meta); diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 
fd7e177da63..e2a993b9e20 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -101,7 +101,8 @@ public: seastar::future<struct stat> stat( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<ceph::bufferlist> read( CollectionRef c, @@ -118,32 +119,38 @@ public: base_errorator::future<bool> exists( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; get_attr_errorator::future<ceph::bufferlist> get_attr( CollectionRef c, const ghobject_t& oid, - std::string_view name) const final; + std::string_view name, + uint32_t op_flags = 0) const final; get_attrs_ertr::future<attrs_t> get_attrs( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) final; + const omap_keys_t& keys, + uint32_t op_flags = 0) final; /// Retrieves paged set of values > start (if present) read_errorator::future<omap_values_paged_t> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) final; ///< @return <done, values> values.empty() iff done get_attr_errorator::future<bufferlist> omap_get_header( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; /// std::get<1>(ret) returns end if and only if the listing has listed all /// the items within the range, otherwise it returns the next key to be listed. 
@@ -151,7 +158,8 @@ public: CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const final; + uint64_t limit, + uint32_t op_flags = 0) const final; seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; seastar::future<CollectionRef> open_collection(const coll_t& cid) final; @@ -170,7 +178,8 @@ public: CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) final; + uint64_t len, + uint32_t op_flags = 0) final; unsigned get_max_attr_name_length() const final { return 256; @@ -251,7 +260,8 @@ public: return seastar::do_with( internal_context_t( ch, std::move(t), - transaction_manager->create_transaction(src, tname)), + transaction_manager->create_transaction( + src, tname, t.get_fadvise_flags())), std::forward<F>(f), [this, op_type](auto &ctx, auto &f) { assert(shard_stats.starting_io_num); @@ -298,20 +308,22 @@ public: Transaction::src_t src, const char* tname, op_type_t op_type, + cache_hint_t cache_hint_flags, F &&f) const { auto begin_time = std::chrono::steady_clock::now(); return seastar::do_with( oid, Ret{}, std::forward<F>(f), - [this, ch, src, op_type, begin_time, tname + [this, ch, src, op_type, begin_time, tname, cache_hint_flags ](auto &oid, auto &ret, auto &f) { - return repeat_eagain([&, this, ch, src, tname] { + return repeat_eagain([&, this, ch, src, tname, cache_hint_flags] { assert(src == Transaction::src_t::READ); ++(shard_stats.repeat_read_num); return transaction_manager->with_transaction_intr( src, tname, + cache_hint_flags, [&, this, ch, tname](auto& t) { LOG_PREFIX(SeaStoreS::repeat_with_onode); diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 7c7a6833006..5930469ca07 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -20,9 +20,42 @@ #include "include/intarith.h" #include "include/interval_set.h" #include "include/uuid.h" +#include "include/rados.h" namespace 
crimson::os::seastore { +class cache_hint_t { + enum hint_t { + TOUCH, + NOCACHE + }; +public: + static constexpr cache_hint_t get_touch() { + return hint_t::TOUCH; + } + static constexpr cache_hint_t get_nocache() { + return hint_t::NOCACHE; + } + cache_hint_t(uint32_t flags) { + if (unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) || + unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) { + hint = NOCACHE; + } + } + bool operator==(const cache_hint_t &other) const { + return hint == other.hint; + } + bool operator!=(const cache_hint_t &other) const { + return hint != other.hint; + } +private: + constexpr cache_hint_t(hint_t hint) : hint(hint) {} + hint_t hint = hint_t::TOUCH; +}; + +inline constexpr cache_hint_t CACHE_HINT_TOUCH = cache_hint_t::get_touch(); +inline constexpr cache_hint_t CACHE_HINT_NOCACHE = cache_hint_t::get_nocache(); + /* using a special xattr key "omap_header" to store omap header */ const std::string OMAP_HEADER_XATTR_KEY = "omap_header"; @@ -1228,7 +1261,6 @@ constexpr laddr_t L_ADDR_MAX = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX); constexpr laddr_t L_ADDR_MIN = laddr_t::from_raw_uint(0); constexpr laddr_t L_ADDR_NULL = L_ADDR_MAX; constexpr laddr_t L_ADDR_ROOT = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX - 1); -constexpr laddr_t L_ADDR_LBAT = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX - 2); struct __attribute__((packed)) laddr_le_t { ceph_le64 laddr; @@ -1469,6 +1501,23 @@ constexpr bool is_physical_type(extent_types_t type) { } } +constexpr bool is_backref_mapped_type(extent_types_t type) { + if ((type >= extent_types_t::LADDR_INTERNAL && + type <= extent_types_t::OBJECT_DATA_BLOCK) || + type == extent_types_t::TEST_BLOCK || + type == extent_types_t::TEST_BLOCK_PHYSICAL) { + assert(is_logical_type(type) || + is_lba_node(type) || + type == extent_types_t::TEST_BLOCK_PHYSICAL); + return true; + } else { + assert(!is_logical_type(type) && + !is_lba_node(type) && + type != extent_types_t::TEST_BLOCK_PHYSICAL); + return false; + 
} +} + constexpr bool is_real_type(extent_types_t type) { if (type <= extent_types_t::OBJECT_DATA_BLOCK || (type >= extent_types_t::TEST_BLOCK && @@ -1945,12 +1994,13 @@ struct __attribute__((packed)) root_t { struct alloc_blk_t { alloc_blk_t( - paddr_t paddr, - laddr_t laddr, + const paddr_t& paddr, + const laddr_t& laddr, extent_len_t len, extent_types_t type) - : paddr(paddr), laddr(laddr), len(len), type(type) - {} + : paddr(paddr), laddr(laddr), len(len), type(type) { + assert(len > 0); + } explicit alloc_blk_t() = default; @@ -1966,6 +2016,25 @@ struct alloc_blk_t { denc(v.type, p); DENC_FINISH(p); } + + static alloc_blk_t create_alloc( + const paddr_t& paddr, + const laddr_t& laddr, + extent_len_t len, + extent_types_t type) { + assert(is_backref_mapped_type(type)); + assert(laddr != L_ADDR_NULL); + return alloc_blk_t(paddr, laddr, len, type); + } + + static alloc_blk_t create_retire( + const paddr_t& paddr, + extent_len_t len, + extent_types_t type) { + assert(is_backref_mapped_type(type) || + is_retired_placeholder_type(type)); + return alloc_blk_t(paddr, L_ADDR_NULL, len, type); + } }; // use absolute address diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 9b95161a404..cd8c333c69f 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -8,11 +8,12 @@ #include <boost/intrusive/list.hpp> #include "crimson/common/log.h" +#include "crimson/os/seastore/backref_entry.h" +#include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/logging.h" #include "crimson/os/seastore/ordering_handle.h" -#include "crimson/os/seastore/seastore_types.h" -#include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/root_block.h" +#include "crimson/os/seastore/seastore_types.h" #include "crimson/os/seastore/transaction_interruptor.h" namespace crimson::os::seastore { @@ -408,12 +409,14 @@ public: src_t src, journal_seq_t initiated_after, on_destruct_func_t&& 
f, - transaction_id_t trans_id + transaction_id_t trans_id, + cache_hint_t cache_hint ) : weak(weak), handle(std::move(handle)), on_destruct(std::move(f)), src(src), - trans_id(trans_id) + trans_id(trans_id), + cache_hint(cache_hint) {} void invalidate_clear_write_set() { @@ -460,6 +463,7 @@ public: ool_write_stats = {}; rewrite_stats = {}; conflicted = false; + assert(backref_entries.empty()); if (!has_reset) { has_reset = true; } @@ -571,10 +575,23 @@ public: return pre_alloc_list; } + cache_hint_t get_cache_hint() const { + return cache_hint; + } + private: friend class Cache; friend Ref make_test_transaction(); + void set_backref_entries(backref_entry_refs_t&& entries) { + assert(backref_entries.empty()); + backref_entries = std::move(entries); + } + + backref_entry_refs_t move_backref_entries() { + return std::move(backref_entries); + } + /** * If set, *this may not be used to perform writes and will not provide * consistentency allowing operations using to avoid maintaining a read_set. 
@@ -669,6 +686,10 @@ private: transaction_id_t trans_id = TRANS_ID_NULL; seastar::lw_shared_ptr<rbm_pending_ool_t> pending_ool; + + backref_entry_refs_t backref_entries; + + cache_hint_t cache_hint = CACHE_HINT_TOUCH; }; using TransactionRef = Transaction::Ref; @@ -681,7 +702,8 @@ inline TransactionRef make_test_transaction() { Transaction::src_t::MUTATE, JOURNAL_SEQ_NULL, [](Transaction&) {}, - ++next_id + ++next_id, + CACHE_HINT_TOUCH ); } diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 94e9b3b9ab1..807d88b2cbc 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -66,6 +66,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() return with_transaction_intr( Transaction::src_t::MUTATE, "mkfs_tm", + CACHE_HINT_TOUCH, [this, FNAME](auto& t) { cache->init(); @@ -131,6 +132,7 @@ TransactionManager::mount() journal->get_trimmer().set_journal_head(start_seq); return with_transaction_weak( "mount", + CACHE_HINT_TOUCH, [this](auto &t) { return cache->init_cached_extents(t, [this](auto &t, auto &e) { @@ -461,8 +463,12 @@ TransactionManager::do_submit_transaction( } SUBTRACET(seastore_t, "submitting record", tref); - return journal->submit_record(std::move(record), tref.get_handle() - ).safe_then([this, FNAME, &tref](auto submit_result) mutable { + return journal->submit_record( + std::move(record), + tref.get_handle(), + tref.get_src(), + [this, FNAME, &tref](record_locator_t submit_result) + { SUBDEBUGT(seastore_t, "committed with {}", tref, submit_result); auto start_seq = submit_result.write_result.start_seq; journal->get_trimmer().set_journal_head(start_seq); @@ -473,10 +479,8 @@ TransactionManager::do_submit_transaction( journal->get_trimmer().update_journal_tails( cache->get_oldest_dirty_from().value_or(start_seq), cache->get_oldest_backref_dirty_from().value_or(start_seq)); - return journal->finish_commit(tref.get_src() - 
).then([&tref] { - return tref.get_handle().complete(); - }); + }).safe_then([&tref] { + return tref.get_handle().complete(); }).handle_error( submit_transaction_iertr::pass_further{}, crimson::ct_error::assert_all{"Hit error submitting to journal"} diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index dc6cc20cf59..e574460894a 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -741,8 +741,9 @@ public: TransactionRef create_transaction( Transaction::src_t src, const char* name, + cache_hint_t cache_hint = CACHE_HINT_TOUCH, bool is_weak=false) final { - return cache->create_transaction(src, name, is_weak); + return cache->create_transaction(src, name, cache_hint, is_weak); } using ExtentCallbackInterface::submit_transaction_direct_ret; diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h index 64544d4c870..ce649303d4f 100644 --- a/src/crimson/osd/backfill_facades.h +++ b/src/crimson/osd/backfill_facades.h @@ -82,6 +82,9 @@ struct PGFacade final : BackfillState::PGFacade { } PGFacade(PG& pg) : pg(pg) {} + std::ostream &print(std::ostream &out) const override { + return out << pg; + } }; } // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 837fd2eb2af..f957f072c93 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -8,11 +8,7 @@ #include "crimson/osd/backfill_state.h" #include "osd/osd_types_fmt.h" -namespace { - seastar::logger& logger() { - return crimson::get_logger(ceph_subsys_osd); - } -} +SET_SUBSYS(osd); namespace crimson::osd { @@ -27,22 +23,23 @@ BackfillState::BackfillState( progress_tracker( std::make_unique<BackfillState::ProgressTracker>(backfill_machine)) { - logger().debug("{}:{}", __func__, __LINE__); + LOG_PREFIX(BackfillState::BackfillState); + DEBUGDPP("", *backfill_machine.pg); 
backfill_machine.initiate(); } template <class S> BackfillState::StateHelper<S>::StateHelper() { - logger().debug("enter {}", - boost::typeindex::type_id<S>().pretty_name()); + LOG_PREFIX(BackfillState::StateHelper); + DEBUGDPP("enter {}", pg(), boost::typeindex::type_id<S>().pretty_name()); } template <class S> BackfillState::StateHelper<S>::~StateHelper() { - logger().debug("exit {}", - boost::typeindex::type_id<S>().pretty_name()); + LOG_PREFIX(BackfillState::StateHelper); + DEBUG("exit {}", boost::typeindex::type_id<S>().pretty_name()); } BackfillState::~BackfillState() = default; @@ -63,13 +60,16 @@ BackfillState::BackfillMachine::~BackfillMachine() = default; BackfillState::Initial::Initial(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::Initial::Initial); backfill_state().last_backfill_started = peering_state().earliest_backfill(); - logger().debug("{}: bft={} from {}", - __func__, peering_state().get_backfill_targets(), - backfill_state().last_backfill_started); + DEBUGDPP("{}: bft={} from {}", + pg(), + __func__, + peering_state().get_backfill_targets(), + backfill_state().last_backfill_started); for (const auto& bt : peering_state().get_backfill_targets()) { - logger().debug("{}: target shard {} from {}", - __func__, bt, peering_state().get_peer_last_backfill(bt)); + DEBUGDPP("{}: target shard {} from {}", + pg(), __func__, bt, peering_state().get_peer_last_backfill(bt)); } ceph_assert(peering_state().get_backfill_targets().size()); ceph_assert(!backfill_state().last_backfill_started.is_max()); @@ -80,7 +80,8 @@ BackfillState::Initial::Initial(my_context ctx) boost::statechart::result BackfillState::Initial::react(const BackfillState::Triggered& evt) { - logger().debug("{}: backfill triggered", __func__); + LOG_PREFIX(BackfillState::Initial::react::Triggered); + DEBUGDPP("", pg()); ceph_assert(backfill_state().last_backfill_started == \ peering_state().earliest_backfill()); ceph_assert(peering_state().is_backfilling()); @@ -93,26 +94,10 @@ 
BackfillState::Initial::react(const BackfillState::Triggered& evt) if (Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, backfill_state().peer_backfill_info)) { - logger().debug("{}: switching to Done state", __func__); + DEBUGDPP("switching to Done state", pg()); return transit<BackfillState::Done>(); } else { - logger().debug("{}: switching to Enqueuing state", __func__); - return transit<BackfillState::Enqueuing>(); - } -} - -boost::statechart::result -BackfillState::Cancelled::react(const BackfillState::Triggered& evt) -{ - logger().debug("{}: backfill re-triggered", __func__); - ceph_assert(peering_state().is_backfilling()); - if (Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - logger().debug("{}: switching to Done state", __func__); - return transit<BackfillState::Done>(); - } else { - logger().debug("{}: switching to Enqueuing state", __func__); + DEBUGDPP("switching to Enqueuing state", pg()); return transit<BackfillState::Enqueuing>(); } } @@ -120,9 +105,10 @@ BackfillState::Cancelled::react(const BackfillState::Triggered& evt) // -- Enqueuing void BackfillState::Enqueuing::maybe_update_range() { + LOG_PREFIX(BackfillState::Enqueuing::maybe_update_range); if (auto& primary_bi = backfill_state().backfill_info; primary_bi.version >= pg().get_projected_last_update()) { - logger().info("{}: bi is current", __func__); + INFODPP("bi is current", pg()); ceph_assert(primary_bi.version == pg().get_projected_last_update()); } else if (primary_bi.version >= peering_state().get_log_tail()) { if (peering_state().get_pg_log().get_log().empty() && @@ -136,31 +122,31 @@ void BackfillState::Enqueuing::maybe_update_range() ceph_assert(primary_bi.version == eversion_t()); return; } - logger().debug("{}: bi is old, ({}) can be updated with log to {}", - __func__, - primary_bi.version, - pg().get_projected_last_update()); + DEBUGDPP("{}: bi is old, ({}) can be updated with log to {}", 
+ pg(), + primary_bi.version, + pg().get_projected_last_update()); auto func = [&](const pg_log_entry_t& e) { - logger().debug("maybe_update_range(lambda): updating from version {}", - e.version); + DEBUGDPP("maybe_update_range(lambda): updating from version {}", + pg(), e.version); if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) { if (e.is_update()) { - logger().debug("maybe_update_range(lambda): {} updated to ver {}", - e.soid, e.version); + DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}", + pg(), e.soid, e.version); primary_bi.objects.erase(e.soid); primary_bi.objects.insert(std::make_pair(e.soid, e.version)); } else if (e.is_delete()) { - logger().debug("maybe_update_range(lambda): {} removed", - e.soid); + DEBUGDPP("maybe_update_range(lambda): {} removed", + pg(), e.soid); primary_bi.objects.erase(e.soid); } } }; - logger().debug("{}: scanning pg log first", __func__); + DEBUGDPP("{}: scanning pg log first", pg()); peering_state().scan_log_after(primary_bi.version, func); - logger().debug("{}: scanning projected log", __func__); + DEBUGDPP("{}: scanning projected log", pg()); pg().get_projected_log().scan_log_after(primary_bi.version, func); primary_bi.version = pg().get_projected_last_update(); } else { @@ -244,6 +230,7 @@ void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( BackfillState::Enqueuing::result_t BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) { + LOG_PREFIX(BackfillState::Enqueuing::remove_on_peers); // set `new_last_backfill_started` to `check` result_t result { {}, check }; for (const auto& bt : peering_state().get_backfill_targets()) { @@ -255,8 +242,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) backfill_listener().enqueue_drop(bt, pbi.begin, version); } } - logger().debug("{}: BACKFILL removing {} from peers {}", - __func__, check, result.pbi_targets); + DEBUGDPP("BACKFILL removing {} from peers {}", + pg(), check, result.pbi_targets); 
ceph_assert(!result.pbi_targets.empty()); return result; } @@ -264,7 +251,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) BackfillState::Enqueuing::result_t BackfillState::Enqueuing::update_on_peers(const hobject_t& check) { - logger().debug("{}: check={}", __func__, check); + LOG_PREFIX(BackfillState::Enqueuing::update_on_peers); + DEBUGDPP("check={}", pg(), check); const auto& primary_bi = backfill_state().backfill_info; result_t result { {}, primary_bi.begin }; std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills; @@ -325,6 +313,7 @@ bool BackfillState::Enqueuing::Enqueuing::all_emptied( BackfillState::Enqueuing::Enqueuing(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::Enqueuing::Enqueuing); auto& primary_bi = backfill_state().backfill_info; // update our local interval to cope with recent changes @@ -334,8 +323,7 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) // that backfill will be spinning here over and over. For the sake // of performance and complexity we don't synchronize with entire PG. // similar can happen in classical OSD. - logger().warn("{}: bi is old, rescanning of local backfill_info", - __func__); + WARNDPP("bi is old, rescanning of local backfill_info", pg()); post_event(RequestPrimaryScanning{}); return; } else { @@ -347,13 +335,14 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) primary_bi)) { // need to grab one another chunk of the object namespace and restart // the queueing. 
- logger().debug("{}: reached end for current local chunk", __func__); + DEBUGDPP("reached end for current local chunk", pg()); post_event(RequestPrimaryScanning{}); return; } do { if (!backfill_listener().budget_available()) { + DEBUGDPP("throttle failed, turning to Waiting", pg()); post_event(RequestWaiting{}); return; } else if (should_rescan_replicas(backfill_state().peer_backfill_info, @@ -392,16 +381,25 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) } } while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)); - if (backfill_state().progress_tracker->tracked_objects_completed() - && Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - backfill_state().last_backfill_started = hobject_t::get_max(); - backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. 
+ DEBUGDPP("reached end for current local chunk", pg()); + post_event(RequestPrimaryScanning{}); + return; + } else { + if (backfill_state().progress_tracker->tracked_objects_completed() + && Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + } + DEBUGDPP("reached end for both local and all peers " + "but still has in-flight operations", pg()); + post_event(RequestWaiting{}); } - logger().debug("{}: reached end for both local and all peers " - "but still has in-flight operations", __func__); - post_event(RequestWaiting{}); } // -- PrimaryScanning @@ -416,16 +414,45 @@ BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx) boost::statechart::result BackfillState::PrimaryScanning::react(PrimaryScanned evt) { - logger().debug("{}", __func__); + LOG_PREFIX(BackfillState::PrimaryScanning::react::PrimaryScanned); + DEBUGDPP("", pg()); backfill_state().backfill_info = std::move(evt.result); - return transit<Enqueuing>(); + if (!backfill_state().is_suspended()) { + return transit<Enqueuing>(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(CancelBackfill evt) +{ + LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill); + DEBUGDPP("suspended within PrimaryScanning", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::PrimaryScanning::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit<Enqueuing>(); + } + return discard_event(); } 
boost::statechart::result BackfillState::PrimaryScanning::react(ObjectPushed evt) { - logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::PrimaryScanning::react::ObjectPushed); + DEBUGDPP("PrimaryScanning::react() on ObjectPushed; evt.object={}", + pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -443,11 +470,11 @@ bool BackfillState::ReplicasScanning::replica_needs_scan( BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::ReplicasScanning::ReplicasScanning); for (const auto& bt : peering_state().get_backfill_targets()) { if (const auto& pbi = backfill_state().peer_backfill_info.at(bt); replica_needs_scan(pbi, backfill_state().backfill_info)) { - logger().debug("{}: scanning peer osd.{} from {}", - __func__, bt, pbi.end); + DEBUGDPP("scanning peer osd.{} from {}", pg(), bt, pbi.end); backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{}); ceph_assert(waiting_on_backfill.find(bt) == \ @@ -469,8 +496,9 @@ BackfillState::ReplicasScanning::~ReplicasScanning() boost::statechart::result BackfillState::ReplicasScanning::react(ReplicaScanned evt) { - logger().debug("{}: got scan result from osd={}, result={}", - __func__, evt.from, evt.result); + LOG_PREFIX(BackfillState::ReplicasScanning::react::ReplicaScanned); + DEBUGDPP("got scan result from osd={}, result={}", + pg(), evt.from, evt.result); // TODO: maybe we'll be able to move waiting_on_backfill from // the machine to the state. 
ceph_assert(peering_state().is_backfill_target(evt.from)); @@ -479,12 +507,17 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) if (waiting_on_backfill.empty()) { ceph_assert(backfill_state().peer_backfill_info.size() == \ peering_state().get_backfill_targets().size()); - return transit<Enqueuing>(); + if (!backfill_state().is_suspended()) { + return transit<Enqueuing>(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } } } else { - // we canceled backfill for a while due to a too full, and this + // we suspended backfill for a while due to a too full, and this // is an extra response from a non-too-full peer - logger().debug("{}: canceled backfill (too full?)", __func__); + DEBUGDPP("suspended backfill (too full?)", pg()); } return discard_event(); } @@ -492,17 +525,30 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) boost::statechart::result BackfillState::ReplicasScanning::react(CancelBackfill evt) { - logger().debug("{}: cancelled within ReplicasScanning", - __func__); - waiting_on_backfill.clear(); - return transit<Cancelled>(); + LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill); + DEBUGDPP("suspended within ReplicasScanning", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::ReplicasScanning::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::ReplicasScanning::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit<Enqueuing>(); + } + return discard_event(); } boost::statechart::result BackfillState::ReplicasScanning::react(ObjectPushed evt) { - logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::ReplicasScanning::react::ObjectPushed); + DEBUGDPP("ReplicasScanning::react() on ObjectPushed; 
evt.object={}", + pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -517,17 +563,45 @@ BackfillState::Waiting::Waiting(my_context ctx) boost::statechart::result BackfillState::Waiting::react(ObjectPushed evt) { - logger().debug("Waiting::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::Waiting::react::ObjectPushed); + DEBUGDPP("Waiting::react() on ObjectPushed; evt.object={}", pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false); - return transit<Enqueuing>();; + if (!backfill_state().is_suspended()) { + return transit<Enqueuing>(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::Waiting::react(CancelBackfill evt) +{ + LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill); + DEBUGDPP("suspended within Waiting", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::Waiting::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::Waiting::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit<Enqueuing>(); + } + return discard_event(); } // -- Done BackfillState::Done::Done(my_context ctx) : my_base(ctx) { - logger().info("{}: backfill is done", __func__); + LOG_PREFIX(BackfillState::Done::Done); + INFODPP("backfill is done", pg()); backfill_listener().backfilled(); } @@ -537,13 +611,6 @@ BackfillState::Crashed::Crashed() ceph_abort_msg("{}: this should not happen"); } -// -- Cancelled -BackfillState::Cancelled::Cancelled(my_context ctx) - : my_base(ctx) -{ - ceph_assert(peering_state().get_backfill_targets().size()); -} - // ProgressTracker is an intermediary between the BackfillListener and 
// BackfillMachine + its states. All requests to push or drop an object // are directed through it. The same happens with notifications about @@ -577,8 +644,8 @@ void BackfillState::ProgressTracker::complete_to( const pg_stat_t& stats, bool may_push_to_max) { - logger().debug("{}: obj={}", - __func__, obj); + LOG_PREFIX(BackfillState::ProgressTracker::complete_to); + DEBUGDPP("obj={}", pg(), obj); if (auto completion_iter = registry.find(obj); completion_iter != std::end(registry)) { completion_iter->second = \ @@ -619,4 +686,19 @@ void BackfillState::enqueue_standalone_push( backfill_machine.backfill_listener.enqueue_push(obj, v, peers); } +void BackfillState::enqueue_standalone_delete( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) +{ + progress_tracker->enqueue_drop(obj); + for (auto bt : peers) { + backfill_machine.backfill_listener.enqueue_drop(bt, obj, v); + } +} + +std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg) { + return pg.print(out); +} + } // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 072c91e079d..517a02ea4df 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -62,6 +62,8 @@ struct BackfillState { struct CancelBackfill : sc::event<CancelBackfill> { }; + struct ThrottleAcquired : sc::event<ThrottleAcquired> { + }; private: // internal events struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> { @@ -136,34 +138,10 @@ public: explicit Crashed(); }; - struct Cancelled : sc::state<Cancelled, BackfillMachine>, - StateHelper<Cancelled> { - using reactions = boost::mpl::list< - sc::custom_reaction<Triggered>, - sc::custom_reaction<PrimaryScanned>, - sc::custom_reaction<ReplicaScanned>, - sc::custom_reaction<ObjectPushed>, - sc::transition<sc::event_base, Crashed>>; - explicit Cancelled(my_context); - // resume after triggering backfill by on_activate_complete(). 
- // transit to Enqueuing. - sc::result react(const Triggered&); - sc::result react(const PrimaryScanned&) { - return discard_event(); - } - sc::result react(const ReplicaScanned&) { - return discard_event(); - } - sc::result react(const ObjectPushed&) { - return discard_event(); - } - }; - struct Initial : sc::state<Initial, BackfillMachine>, StateHelper<Initial> { using reactions = boost::mpl::list< sc::custom_reaction<Triggered>, - sc::transition<CancelBackfill, Cancelled>, sc::transition<sc::event_base, Crashed>>; explicit Initial(my_context); // initialize after triggering backfill by on_activate_complete(). @@ -174,12 +152,9 @@ public: struct Enqueuing : sc::state<Enqueuing, BackfillMachine>, StateHelper<Enqueuing> { using reactions = boost::mpl::list< - sc::transition<CancelBackfill, Cancelled>, sc::transition<RequestPrimaryScanning, PrimaryScanning>, sc::transition<RequestReplicasScanning, ReplicasScanning>, sc::transition<RequestWaiting, Waiting>, - sc::transition<RequestDone, Done>, - sc::transition<CancelBackfill, Cancelled>, sc::transition<sc::event_base, Crashed>>; explicit Enqueuing(my_context); @@ -237,12 +212,15 @@ public: sc::custom_reaction<ObjectPushed>, sc::custom_reaction<PrimaryScanned>, sc::transition<RequestDone, Done>, - sc::transition<CancelBackfill, Cancelled>, + sc::custom_reaction<CancelBackfill>, + sc::custom_reaction<Triggered>, sc::transition<sc::event_base, Crashed>>; explicit PrimaryScanning(my_context); sc::result react(ObjectPushed); // collect scanning result and transit to Enqueuing. 
sc::result react(PrimaryScanned); + sc::result react(CancelBackfill); + sc::result react(Triggered); }; struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>, @@ -251,6 +229,7 @@ public: sc::custom_reaction<ObjectPushed>, sc::custom_reaction<ReplicaScanned>, sc::custom_reaction<CancelBackfill>, + sc::custom_reaction<Triggered>, sc::transition<RequestDone, Done>, sc::transition<sc::event_base, Crashed>>; explicit ReplicasScanning(my_context); @@ -259,6 +238,7 @@ public: sc::result react(ObjectPushed); sc::result react(ReplicaScanned); sc::result react(CancelBackfill); + sc::result react(Triggered); // indicate whether a particular peer should be scanned to retrieve // BackfillInterval for new range of hobject_t namespace. @@ -277,10 +257,14 @@ public: using reactions = boost::mpl::list< sc::custom_reaction<ObjectPushed>, sc::transition<RequestDone, Done>, - sc::transition<CancelBackfill, Cancelled>, + sc::custom_reaction<CancelBackfill>, + sc::custom_reaction<Triggered>, + sc::transition<ThrottleAcquired, Enqueuing>, sc::transition<sc::event_base, Crashed>>; explicit Waiting(my_context); sc::result react(ObjectPushed); + sc::result react(CancelBackfill); + sc::result react(Triggered); }; struct Done : sc::state<Done, BackfillMachine>, @@ -308,6 +292,11 @@ public: const hobject_t &obj, const eversion_t &v, const std::vector<pg_shard_t> &peers); + void enqueue_standalone_delete( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); + bool is_triggered() const { return backfill_machine.triggering_event() != nullptr; @@ -325,6 +314,26 @@ public: } } private: + struct backfill_suspend_state_t { + bool suspended = false; + bool should_go_enqueuing = false; + } backfill_suspend_state; + bool is_suspended() const { + return backfill_suspend_state.suspended; + } + void on_suspended() { + ceph_assert(!is_suspended()); + backfill_suspend_state = {true, false}; + } + bool on_resumed() { + auto go_enqueuing = 
backfill_suspend_state.should_go_enqueuing; + backfill_suspend_state = {false, false}; + return go_enqueuing; + } + void go_enqueuing_on_resume() { + ceph_assert(is_suspended()); + backfill_suspend_state.should_go_enqueuing = true; + } hobject_t last_backfill_started; BackfillInterval backfill_info; std::map<pg_shard_t, BackfillInterval> peer_backfill_info; @@ -405,8 +414,10 @@ struct BackfillState::PGFacade { virtual const eversion_t& get_projected_last_update() const = 0; virtual const PGLog::IndexedLog& get_projected_log() const = 0; + virtual std::ostream &print(std::ostream &out) const = 0; virtual ~PGFacade() {} }; +std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg); class BackfillState::ProgressTracker { // TODO: apply_stat, @@ -433,6 +444,9 @@ class BackfillState::ProgressTracker { BackfillListener& backfill_listener() { return backfill_machine.backfill_listener; } + PGFacade& pg() { + return *backfill_machine.pg; + } public: ProgressTracker(BackfillMachine& backfill_machine) @@ -447,3 +461,9 @@ public: }; } // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::BackfillState::PGFacade> + : fmt::ostream_formatter {}; +#endif + diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc index 7ed3dc0e3fe..0bfd3e2266b 100644 --- a/src/crimson/osd/main.cc +++ b/src/crimson/osd/main.cc @@ -202,7 +202,7 @@ int main(int argc, const char* argv[]) true); } auto store = crimson::os::FuturizedStore::create( - local_conf().get_val<std::string>("osd_objectstore"), + local_conf().get_val<std::string>("crimson_osd_objectstore"), local_conf().get_val<std::string>("osd_data"), local_conf().get_config_values()); diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h index 2897a7e1623..394375c1129 100644 --- a/src/crimson/osd/osd_operation.h +++ b/src/crimson/osd/osd_operation.h @@ -62,6 +62,12 @@ struct PGRepopPipeline { struct Process : OrderedExclusivePhaseT<Process> { 
static constexpr auto type_name = "PGRepopPipeline::process"; } process; + struct WaitCommit : OrderedConcurrentPhaseT<WaitCommit> { + static constexpr auto type_name = "PGRepopPipeline::wait_repop"; + } wait_commit; + struct SendReply : OrderedExclusivePhaseT<SendReply> { + static constexpr auto type_name = "PGRepopPipeline::send_reply"; + } send_reply; }; struct CommonOBCPipeline { @@ -211,6 +217,9 @@ protected: public: static constexpr bool is_trackable = true; + virtual bool requires_pg() const { + return true; + } }; template <class T> @@ -332,6 +341,18 @@ public: with_throttle_while(std::forward<Args>(args)...), *this); } + // Returns std::nullopt if the throttle is acquired immediately, + // returns the future for the acquiring otherwise + std::optional<seastar::future<>> + try_acquire_throttle_now(crimson::osd::scheduler::params_t params) { + if (!max_in_progress || in_progress < max_in_progress) { + ++in_progress; + --pending; + return std::nullopt; + } + return acquire_throttle(params); + } + private: void dump_detail(Formatter *f) const final; diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h index 8dd17bb036d..6a2d7e3ccbd 100644 --- a/src/crimson/osd/osd_operation_external_tracking.h +++ b/src/crimson/osd/osd_operation_external_tracking.h @@ -38,7 +38,10 @@ struct LttngBackend CommonOBCPipeline::WaitRepop::BlockingEvent::Backend, CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, CommonOBCPipeline::SendReply::BlockingEvent::Backend, - PGRepopPipeline::Process::BlockingEvent::Backend + PGRepopPipeline::Process::BlockingEvent::Backend, + PGRepopPipeline::WaitCommit::BlockingEvent::Backend, + PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent::Backend, + PGRepopPipeline::SendReply::BlockingEvent::Backend { void handle(ClientRequest::StartEvent&, const Operation&) override {} @@ -126,6 +129,20 @@ struct LttngBackend const PGRepopPipeline::Process& blocker) 
override { } + void handle(PGRepopPipeline::WaitCommit::BlockingEvent& ev, + const Operation& op, + const PGRepopPipeline::WaitCommit& blocker) override { + } + + void handle(PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent& ev, + const Operation& op) override { + } + + void handle(PGRepopPipeline::SendReply::BlockingEvent& ev, + const Operation& op, + const PGRepopPipeline::SendReply& blocker) override { + } + void handle(ClientRequest::CompletionEvent&, const Operation&) override {} @@ -150,7 +167,10 @@ struct HistoricBackend CommonOBCPipeline::WaitRepop::BlockingEvent::Backend, CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, CommonOBCPipeline::SendReply::BlockingEvent::Backend, - PGRepopPipeline::Process::BlockingEvent::Backend + PGRepopPipeline::Process::BlockingEvent::Backend, + PGRepopPipeline::WaitCommit::BlockingEvent::Backend, + PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent::Backend, + PGRepopPipeline::SendReply::BlockingEvent::Backend { void handle(ClientRequest::StartEvent&, const Operation&) override {} @@ -246,6 +266,21 @@ struct HistoricBackend const PGRepopPipeline::Process& blocker) override { } + void handle(PGRepopPipeline::WaitCommit::BlockingEvent& ev, + const Operation& op, + const PGRepopPipeline::WaitCommit& blocker) override { + } + + void handle(PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent& ev, + const Operation& op) override { + } + + void handle(PGRepopPipeline::SendReply::BlockingEvent& ev, + const Operation& op, + const PGRepopPipeline::SendReply& blocker) override { + } + + void handle(ClientRequest::CompletionEvent&, const Operation& op) override { if (crimson::common::local_conf()->osd_op_history_size) { to_client_request(op).put_historic(); diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index 98443bdfc0f..91a6728fd4b 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ 
b/src/crimson/osd/osd_operations/client_request.h @@ -42,6 +42,10 @@ class ClientRequest final : public PhasedOperationT<ClientRequest>, unsigned instance_id = 0; public: + epoch_t get_epoch_sent_at() const { + return m->get_map_epoch(); + } + /** * instance_handle_t * diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h index e12243ce430..fe4761c4ab4 100644 --- a/src/crimson/osd/osd_operations/logmissing_request.h +++ b/src/crimson/osd/osd_operations/logmissing_request.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h index 71651d16789..bdb6c2ac6ac 100644 --- a/src/crimson/osd/osd_operations/logmissing_request_reply.h +++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h index 85de5c711d6..aa6b8a95a94 100644 --- a/src/crimson/osd/osd_operations/peering_event.h +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -44,6 +44,10 @@ protected: float delay = 0; PGPeeringEvent evt; + epoch_t get_epoch_sent_at() const { + return evt.get_epoch_sent(); + } + const pg_shard_t get_from() const { return from; } @@ -84,6 +88,10 @@ public: evt(std::forward<Args>(args)...) 
{} + bool requires_pg() const final { + return evt.requires_pg; + } + void print(std::ostream &) const final; void dump_detail(ceph::Formatter* f) const final; seastar::future<> with_pg( diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h index 43be7319545..21702f6ff4f 100644 --- a/src/crimson/osd/osd_operations/pg_advance_map.h +++ b/src/crimson/osd/osd_operations/pg_advance_map.h @@ -50,6 +50,10 @@ public: PGPeeringPipeline::Process::BlockingEvent > tracking_events; + epoch_t get_epoch_sent_at() const { + return to; + } + private: PGPeeringPipeline &peering_pp(PG &pg); }; diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h index 17c2faf97ea..2fe8ff372b3 100644 --- a/src/crimson/osd/osd_operations/recovery_subrequest.h +++ b/src/crimson/osd/osd_operations/recovery_subrequest.h @@ -39,6 +39,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return m->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return m->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc index 332ec4dfeb7..ec607758c55 100644 --- a/src/crimson/osd/osd_operations/replicated_request.cc +++ b/src/crimson/osd/osd_operations/replicated_request.cc @@ -5,6 +5,7 @@ #include "common/Formatter.h" +#include "crimson/common/coroutine.h" #include "crimson/osd/osd.h" #include "crimson/osd/osd_connection_priv.h" #include "crimson/osd/osd_operation_external_tracking.h" @@ -63,34 +64,52 @@ PGRepopPipeline &RepRequest::repop_pipeline(PG &pg) return pg.repop_pipeline; } +RepRequest::interruptible_future<> RepRequest::with_pg_interruptible( + Ref<PG> pg) +{ + LOG_PREFIX(RepRequest::with_pg_interruptible); + DEBUGI("{}", *this); + co_await this->template 
enter_stage<interruptor>(repop_pipeline(*pg).process); + co_await interruptor::make_interruptible(this->template with_blocking_event< + PG_OSDMapGate::OSDMapBlocker::BlockingEvent + >([this, pg](auto &&trigger) { + return pg->osdmap_gate.wait_for_map( + std::move(trigger), req->min_epoch); + })); + + if (pg->can_discard_replica_op(*req)) { + co_return; + } + + auto [commit_fut, reply] = co_await pg->handle_rep_op(req); + + // Transitions from OrderedExclusive->OrderedConcurrent cannot block + this->template enter_stage_sync(repop_pipeline(*pg).wait_commit); + + co_await std::move(commit_fut); + + co_await this->template enter_stage<interruptor>( + repop_pipeline(*pg).send_reply); + + co_await interruptor::make_interruptible( + pg->shard_services.send_to_osd( + req->from.osd, std::move(reply), pg->get_osdmap_epoch()) + ); +} + seastar::future<> RepRequest::with_pg( ShardServices &shard_services, Ref<PG> pg) { LOG_PREFIX(RepRequest::with_pg); - DEBUGI("{}: RepRequest::with_pg", *this); + DEBUGI("{}", *this); IRef ref = this; return interruptor::with_interruption([this, pg] { - LOG_PREFIX(RepRequest::with_pg); - DEBUGI("{}: pg present", *this); - return this->template enter_stage<interruptor>(repop_pipeline(*pg).process - ).then_interruptible([this, pg] { - return this->template with_blocking_event< - PG_OSDMapGate::OSDMapBlocker::BlockingEvent - >([this, pg](auto &&trigger) { - return pg->osdmap_gate.wait_for_map( - std::move(trigger), req->min_epoch); - }); - }).then_interruptible([this, pg] (auto) { - return pg->handle_rep_op(req); - }).then_interruptible([this] { - logger().debug("{}: complete", *this); - return handle.complete(); - }); + return with_pg_interruptible(pg); }, [](std::exception_ptr) { return seastar::now(); }, pg, pg->get_osdmap_epoch()).finally([this, ref=std::move(ref)] { logger().debug("{}: exit", *this); - handle.exit(); + return handle.complete(); }); } diff --git a/src/crimson/osd/osd_operations/replicated_request.h 
b/src/crimson/osd/osd_operations/replicated_request.h index 1e84fd108e2..c2494b3715f 100644 --- a/src/crimson/osd/osd_operations/replicated_request.h +++ b/src/crimson/osd/osd_operations/replicated_request.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); @@ -68,6 +71,9 @@ public: r_conn = make_local_shared_foreign(std::move(conn)); } + interruptible_future<> with_pg_interruptible( + Ref<PG> pg); + seastar::future<> with_pg( ShardServices &shard_services, Ref<PG> pg); @@ -78,6 +84,8 @@ public: ConnectionPipeline::GetPGMapping::BlockingEvent, PerShardPipeline::CreateOrWaitPG::BlockingEvent, PGRepopPipeline::Process::BlockingEvent, + PGRepopPipeline::WaitCommit::BlockingEvent, + PGRepopPipeline::SendReply::BlockingEvent, PG_OSDMapGate::OSDMapBlocker::BlockingEvent, PGMap::PGCreationBlockingEvent, OSD_OSDMapGate::OSDMapBlocker::BlockingEvent diff --git a/src/crimson/osd/osd_operations/scrub_events.h b/src/crimson/osd/osd_operations/scrub_events.h index 02a5d852bb7..8bed90e4c14 100644 --- a/src/crimson/osd/osd_operations/scrub_events.h +++ b/src/crimson/osd/osd_operations/scrub_events.h @@ -27,11 +27,11 @@ class RemoteScrubEventBaseT : public PhasedOperationT<T> { crimson::net::ConnectionRef l_conn; crimson::net::ConnectionXcoreRef r_conn; - epoch_t epoch; spg_t pgid; protected: using interruptor = InterruptibleOperation::interruptor; + epoch_t epoch; template <typename U=void> using ifut = InterruptibleOperation::interruptible_future<U>; @@ -40,7 +40,7 @@ protected: public: RemoteScrubEventBaseT( crimson::net::ConnectionRef conn, epoch_t epoch, spg_t pgid) - : l_conn(std::move(conn)), epoch(epoch), pgid(pgid) {} + : l_conn(std::move(conn)), pgid(pgid), epoch(epoch) {} PGPeeringPipeline &get_peering_pipeline(PG &pg); @@ -117,6 +117,10 @@ public: : 
RemoteScrubEventBaseT<ScrubRequested>(std::forward<Args>(base_args)...), deep(deep) {} + epoch_t get_epoch_sent_at() const { + return epoch; + } + void print(std::ostream &out) const final { out << "(deep=" << deep << ")"; } @@ -141,6 +145,10 @@ public: ceph_assert(scrub::PGScrubber::is_scrub_message(*m)); } + epoch_t get_epoch_sent_at() const { + return epoch; + } + void print(std::ostream &out) const final { out << "(m=" << *m << ")"; } diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index d812d822550..2746e730f2b 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -879,6 +879,17 @@ void PG::enqueue_push_for_backfill( backfill_state->enqueue_standalone_push(obj, v, peers); } +void PG::enqueue_delete_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) +{ + assert(recovery_handler); + assert(recovery_handler->backfill_state); + auto backfill_state = recovery_handler->backfill_state.get(); + backfill_state->enqueue_standalone_delete(obj, v, peers); +} + PG::interruptible_future< std::tuple<PG::interruptible_future<>, PG::interruptible_future<>>> @@ -1215,13 +1226,10 @@ void PG::update_stats(const pg_stat_t &stat) { ); } -PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) +PG::handle_rep_op_fut PG::handle_rep_op(Ref<MOSDRepOp> req) { LOG_PREFIX(PG::handle_rep_op); DEBUGDPP("{}", *this, *req); - if (can_discard_replica_op(*req)) { - co_return; - } ceph::os::Transaction txn; auto encoded_txn = req->get_data().cbegin(); @@ -1243,7 +1251,8 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) txn, false); DEBUGDPP("{} do_transaction", *this, *req); - co_await interruptor::make_interruptible( + + auto commit_fut = interruptor::make_interruptible( shard_services.get_store().do_transaction(coll_ref, std::move(txn)) ); @@ -1254,10 +1263,7 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) req.get(), pg_whoami, 0, map_epoch, req->get_min_epoch(), 
CEPH_OSD_FLAG_ONDISK); reply->set_last_complete_ondisk(lcod); - co_await interruptor::make_interruptible( - shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch) - ); - co_return; + co_return handle_rep_op_ret(std::move(commit_fut), std::move(reply)); } PG::interruptible_future<> PG::update_snap_map( diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index d87d0b2d0e9..06038c0aa00 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -596,7 +596,13 @@ public: using with_obc_func_t = std::function<load_obc_iertr::future<> (ObjectContextRef, ObjectContextRef)>; - interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m); + using handle_rep_op_ret = std::tuple< + interruptible_future<>, // resolves upon commit + MURef<MOSDRepOpReply> // reply message + >; + // outer future resolves upon submission + using handle_rep_op_fut = interruptible_future<handle_rep_op_ret>; + handle_rep_op_fut handle_rep_op(Ref<MOSDRepOp> m); void update_stats(const pg_stat_t &stat); interruptible_future<> update_snap_map( const std::vector<pg_log_entry_t> &log_entries, @@ -898,6 +904,11 @@ private: const hobject_t &obj, const eversion_t &v, const std::vector<pg_shard_t> &peers); + void enqueue_delete_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); + bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const; bool can_discard_op(const MOSDOp& m) const; void context_registry_on_change(); diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc index a40b28caa8b..79895de06de 100644 --- a/src/crimson/osd/pg_backend.cc +++ b/src/crimson/osd/pg_backend.cc @@ -1325,9 +1325,10 @@ maybe_get_omap_vals( PGBackend::ll_read_ierrorator::future<ceph::bufferlist> PGBackend::omap_get_header( const crimson::os::CollectionRef& c, - const ghobject_t& oid) const + const ghobject_t& oid, + uint32_t op_flags) const { - return store->omap_get_header(c, oid) + return store->omap_get_header(c, oid, 
op_flags) .handle_error( crimson::ct_error::enodata::handle([] { return seastar::make_ready_future<bufferlist>(); @@ -1340,10 +1341,13 @@ PGBackend::ll_read_ierrorator::future<> PGBackend::omap_get_header( const ObjectState& os, OSDOp& osd_op, - object_stat_sum_t& delta_stats) const + object_stat_sum_t& delta_stats, + uint32_t op_flags) const { if (os.oi.is_omap()) { - return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then_interruptible( + return omap_get_header( + coll, ghobject_t{os.oi.soid}, CEPH_OSD_OP_FLAG_FADVISE_DONTNEED + ).safe_then_interruptible( [&delta_stats, &osd_op] (ceph::bufferlist&& header) { osd_op.outdata = std::move(header); delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); @@ -1707,7 +1711,8 @@ PGBackend::fiemap( CollectionRef c, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { return store->fiemap(c, oid, off, len); } diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h index c24176a10e7..9c2230375b0 100644 --- a/src/crimson/osd/pg_backend.h +++ b/src/crimson/osd/pg_backend.h @@ -315,7 +315,8 @@ public: CollectionRef c, const ghobject_t& oid, uint64_t off, - uint64_t len); + uint64_t len, + uint32_t op_flags = 0); write_iertr::future<> tmapput( ObjectState& os, @@ -375,11 +376,13 @@ public: object_stat_sum_t& delta_stats); ll_read_ierrorator::future<ceph::bufferlist> omap_get_header( const crimson::os::CollectionRef& c, - const ghobject_t& oid) const; + const ghobject_t& oid, + uint32_t op_flags = 0) const; ll_read_ierrorator::future<> omap_get_header( const ObjectState& os, OSDOp& osd_op, - object_stat_sum_t& delta_stats) const; + object_stat_sum_t& delta_stats, + uint32_t op_flags = 0) const; interruptible_future<> omap_set_header( ObjectState& os, const OSDOp& osd_op, diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index ec3af0d2b00..5eef584c776 100644 --- a/src/crimson/osd/pg_recovery.cc +++ 
b/src/crimson/osd/pg_recovery.cc @@ -67,8 +67,6 @@ PGRecovery::start_recovery_ops( if (max_to_start > 0) { max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started); } - using interruptor = - crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>; return interruptor::parallel_for_each(started, [] (auto&& ifut) { return std::move(ifut); @@ -609,8 +607,21 @@ void PGRecovery::update_peers_last_backfill( bool PGRecovery::budget_available() const { - // TODO: the limits! - return true; + crimson::osd::scheduler::params_t params = + {1, 0, crimson::osd::scheduler::scheduler_class_t::background_best_effort}; + auto &ss = pg->get_shard_services(); + auto futopt = ss.try_acquire_throttle_now(std::move(params)); + if (!futopt) { + return true; + } + std::ignore = interruptor::make_interruptible(std::move(*futopt) + ).then_interruptible([this] { + assert(!backfill_state->is_triggered()); + using BackfillState = crimson::osd::BackfillState; + backfill_state->process_event( + BackfillState::ThrottleAcquired{}.intrusive_from_this()); + }); + return false; } void PGRecovery::on_pg_clean() diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index 657e6d3e888..5c7b5c5ef2b 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -25,6 +25,8 @@ class PGBackend; class PGRecovery : public crimson::osd::BackfillState::BackfillListener { public: + using interruptor = + crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>; template <typename T = void> using interruptible_future = RecoveryBackend::interruptible_future<T>; PGRecovery(PGRecoveryListener* pg) : pg(pg) {} diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h index b9879c8c9dd..f7bd7a6c08e 100644 --- a/src/crimson/osd/pg_shard_manager.h +++ b/src/crimson/osd/pg_shard_manager.h @@ -256,18 +256,40 @@ public: auto &opref = *op; return opref.template with_blocking_event< PGMap::PGCreationBlockingEvent 
- >([&target_shard_services, &opref](auto &&trigger) { - return target_shard_services.wait_for_pg( - std::move(trigger), opref.get_pgid()); - }).safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) { - logger.debug("{}: have_pg", opref); - return opref.with_pg(target_shard_services, pgref); - }).handle_error( - crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { - logger.debug("{}: pg creation canceled, dropping", opref); - return seastar::now(); - }) - ).then([op=std::move(op)] {}); + >([&target_shard_services, &opref, &logger](auto &&trigger) mutable { + auto pg = target_shard_services.get_pg(opref.get_pgid()); + auto fut = ShardServices::wait_for_pg_ertr::make_ready_future<Ref<PG>>(pg); + if (!pg) { + if (opref.requires_pg()) { + auto osdmap = target_shard_services.get_map(); + if (!osdmap->is_up_acting_osd_shard( + opref.get_pgid(), target_shard_services.local_state.whoami)) { + logger.debug( + "pg {} for {} is no longer here, discarding", + opref.get_pgid(), opref); + opref.get_handle().exit(); + auto _fut = seastar::now(); + if (osdmap->get_epoch() > opref.get_epoch_sent_at()) { + _fut = target_shard_services.send_incremental_map( + std::ref(opref.get_foreign_connection()), + opref.get_epoch_sent_at() + 1); + } + return _fut; + } + } + fut = target_shard_services.wait_for_pg( + std::move(trigger), opref.get_pgid()); + } + return fut.safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) { + logger.debug("{}: have_pg", opref); + return opref.with_pg(target_shard_services, pgref); + }).handle_error( + crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { + logger.debug("{}: pg creation canceled, dropping", opref); + return seastar::now(); + }) + ); + }).then([op=std::move(op)] {}); } seastar::future<> load_pgs(crimson::os::FuturizedStore& store); diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc index f09cd147ea9..6c8abecffaf 100644 --- a/src/crimson/osd/replicated_backend.cc 
+++ b/src/crimson/osd/replicated_backend.cc @@ -96,11 +96,18 @@ ReplicatedBackend::submit_transaction( bufferlist encoded_txn; encode(txn, encoded_txn); + bool is_delete = false; for (auto &le : log_entries) { le.mark_unrollbackable(); + if (le.is_delete()) { + is_delete = true; + } } + co_await pg.update_snap_map(log_entries, txn); + std::vector<pg_shard_t> to_push_clone; + std::vector<pg_shard_t> to_push_delete; auto sends = std::make_unique<std::vector<seastar::future<>>>(); for (auto &pg_shard : pg_shards) { if (pg_shard == whoami) { @@ -115,12 +122,17 @@ ReplicatedBackend::submit_transaction( m = new_repop_msg( pg_shard, hoid, encoded_txn, osd_op_p, min_epoch, map_epoch, log_entries, false, tid); - if (_new_clone && pg.is_missing_on_peer(pg_shard, hoid)) { - // The head is in the push queue but hasn't been pushed yet. - // We need to ensure that the newly created clone will be - // pushed as well, otherwise we might skip it. - // See: https://tracker.ceph.com/issues/68808 - to_push_clone.push_back(pg_shard); + if (pg.is_missing_on_peer(pg_shard, hoid)) { + if (_new_clone) { + // The head is in the push queue but hasn't been pushed yet. + // We need to ensure that the newly created clone will be + // pushed as well, otherwise we might skip it. 
+ // See: https://tracker.ceph.com/issues/68808 + to_push_clone.push_back(pg_shard); + } + if (is_delete) { + to_push_delete.push_back(pg_shard); + } } } pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); @@ -130,8 +142,6 @@ ReplicatedBackend::submit_transaction( pg_shard.osd, std::move(m), map_epoch)); } - co_await pg.update_snap_map(log_entries, txn); - pg.log_operation( std::move(log_entries), osd_op_p.pg_trim_to, @@ -157,7 +167,8 @@ ReplicatedBackend::submit_transaction( return seastar::now(); } return peers->all_committed.get_shared_future(); - }).then_interruptible([pending_txn, this, _new_clone, + }).then_interruptible([pending_txn, this, _new_clone, &hoid, + to_push_delete=std::move(to_push_delete), to_push_clone=std::move(to_push_clone)] { auto acked_peers = std::move(pending_txn->second.acked_peers); pending_trans.erase(pending_txn); @@ -167,6 +178,9 @@ ReplicatedBackend::submit_transaction( _new_clone->obs.oi.version, to_push_clone); } + if (!to_push_delete.empty()) { + pg.enqueue_delete_for_backfill(hoid, {}, to_push_delete); + } return seastar::make_ready_future< crimson::osd::acked_peers_t>(std::move(acked_peers)); }); diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc index 76f24196b51..0d6c9d38236 100644 --- a/src/crimson/osd/replicated_recovery_backend.cc +++ b/src/crimson/osd/replicated_recovery_backend.cc @@ -35,6 +35,15 @@ ReplicatedRecoveryBackend::recover_object( logger().debug("recover_object: loading obc: {}", soid); return pg.obc_loader.with_obc<RWState::RWREAD>(soid, [this, soid, need](auto head, auto obc) { + if (!obc->obs.exists) { + // XXX: this recovery must be triggered by backfills and the corresponding + // object must have been deleted by some client request after the object + // is enqueued for push but before the lock is acquired by the recovery. 
+ // + // Abort the recovery in this case, a "recover_delete" must have been + // added for this object by the client request that deleted it. + return interruptor::now(); + } logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid); auto& recovery_waiter = get_recovering(soid); recovery_waiter.obc = obc; @@ -306,7 +315,10 @@ ReplicatedRecoveryBackend::recover_delete( } return seastar::make_ready_future<>(); }).then_interruptible([this, soid, &stat_diff] { - pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + const auto &missing = pg.get_peering_state().get_pg_log().get_missing(); + if (!missing.is_missing(soid)) { + pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + } return seastar::make_ready_future<>(); }); }); @@ -568,14 +580,17 @@ ReplicatedRecoveryBackend::read_metadata_for_push_op( return seastar::make_ready_future<eversion_t>(ver); } return interruptor::make_interruptible(interruptor::when_all_succeed( - backend->omap_get_header(coll, ghobject_t(oid)).handle_error_interruptible<false>( + backend->omap_get_header( + coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED + ).handle_error_interruptible<false>( crimson::os::FuturizedStore::Shard::read_errorator::all_same_way( [oid] (const std::error_code& e) { logger().debug("read_metadata_for_push_op, error {} when getting omap header: {}", e, oid); return seastar::make_ready_future<bufferlist>(); })), - interruptor::make_interruptible(store->get_attrs(coll, ghobject_t(oid))) - .handle_error_interruptible<false>( + interruptor::make_interruptible( + store->get_attrs(coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) + ).handle_error_interruptible<false>( crimson::os::FuturizedStore::Shard::get_attrs_ertr::all_same_way( [oid] (const std::error_code& e) { logger().debug("read_metadata_for_push_op, error {} when getting attrs: {}", e, oid); @@ -613,8 +628,14 @@ ReplicatedRecoveryBackend::read_object_for_push_op( return 
seastar::make_ready_future<uint64_t>(offset); } // 1. get the extents in the interested range - return interruptor::make_interruptible(backend->fiemap(coll, ghobject_t{oid}, - 0, copy_subset.range_end())).safe_then_interruptible( + return interruptor::make_interruptible( + backend->fiemap( + coll, + ghobject_t{oid}, + 0, + copy_subset.range_end(), + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) + ).safe_then_interruptible( [=, this](auto&& fiemap_included) mutable { interval_set<uint64_t> extents; try { @@ -630,8 +651,12 @@ ReplicatedRecoveryBackend::read_object_for_push_op( push_op->data_included.span_of(extents, offset, max_len); // 3. read the truncated extents // TODO: check if the returned extents are pruned - return interruptor::make_interruptible(store->readv(coll, ghobject_t{oid}, - push_op->data_included, 0)); + return interruptor::make_interruptible( + store->readv( + coll, + ghobject_t{oid}, + push_op->data_included, + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)); }).safe_then_interruptible([push_op, range_end=copy_subset.range_end()](auto &&bl) { push_op->data.claim_append(std::move(bl)); uint64_t recovered_to = 0; diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index c2340898929..e1acb34636f 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -783,6 +783,11 @@ seastar::future<> ShardServices::dispatch_context_transaction( co_return; } +Ref<PG> ShardServices::get_pg(spg_t pgid) +{ + return local_state.get_pg(pgid); +} + seastar::future<> ShardServices::dispatch_context_messages( BufferedRecoveryMessages &&ctx) { diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h index 56ac4963fff..f1ed9b8d911 100644 --- a/src/crimson/osd/shard_services.h +++ b/src/crimson/osd/shard_services.h @@ -483,6 +483,8 @@ public: return pg_to_shard_mapping.remove_pg_mapping(pgid); } + Ref<PG> get_pg(spg_t pgid); + crimson::common::CephContext *get_cct() { return &(local_state.cct); } @@ -589,6 
+591,7 @@ public: FORWARD_TO_OSD_SINGLETON(get_pool_info) FORWARD(with_throttle_while, with_throttle_while, local_state.throttler) + FORWARD(try_acquire_throttle_now, try_acquire_throttle_now, local_state.throttler) FORWARD_TO_OSD_SINGLETON(build_incremental_map_msg) FORWARD_TO_OSD_SINGLETON(send_incremental_map) diff --git a/src/crimson/tools/store_nbd/tm_driver.cc b/src/crimson/tools/store_nbd/tm_driver.cc index 389ecd78afc..870809c5153 100644 --- a/src/crimson/tools/store_nbd/tm_driver.cc +++ b/src/crimson/tools/store_nbd/tm_driver.cc @@ -25,6 +25,7 @@ seastar::future<> TMDriver::write( return tm->with_transaction_intr( Transaction::src_t::MUTATE, "write", + CACHE_HINT_TOUCH, [this, offset, &ptr](auto& t) { return tm->remove(t, laddr_t::from_byte_offset(offset) @@ -112,6 +113,7 @@ seastar::future<bufferlist> TMDriver::read( return tm->with_transaction_intr( Transaction::src_t::READ, "read", + CACHE_HINT_TOUCH, [=, &blret, this](auto& t) { return read_extents(t, laddr_t::from_byte_offset(offset), size diff --git a/src/exporter/ceph_exporter.cc b/src/exporter/ceph_exporter.cc index 44b67c7e615..2232851c094 100644 --- a/src/exporter/ceph_exporter.cc +++ b/src/exporter/ceph_exporter.cc @@ -30,13 +30,13 @@ static void handle_signal(int signum) static void usage() { std::cout << "usage: ceph-exporter [options]\n" << "options:\n" - " --sock-dir: The path to ceph daemons socket files dir\n" - " --addrs: Host ip address where exporter is deployed\n" - " --port: Port to deploy exporter on. Default is 9926\n" - " --cert-file: Path to the certificate file to use https\n" - " --key-file: Path to the certificate key file to use https\n" + " --sock-dir: The path to Ceph daemon sockets (*.asok)\n" + " --addrs: Host IP address on which the exporter is to listen\n" + " --port: TCP Port on which the exporter is to listen. 
Default is 9926\n" + " --cert-file: Path to the certificate file when using HTTPS\n" + " --key-file: Path to the certificate key file when using HTTPS\n" " --prio-limit: Only perf counters greater than or equal to prio-limit are fetched. Default: 5\n" - " --stats-period: Time to wait before sending requests again to exporter server (seconds). Default: 5s" + " --stats-period: Interval between daemon scrapes (seconds). Default: 5s" << std::endl; generic_server_usage(); } diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp index 4a7ac3ea6e0..0f5a9036eff 100644 --- a/src/include/rados/librados.hpp +++ b/src/include/rados/librados.hpp @@ -202,6 +202,8 @@ inline namespace v14_2_0 { int set_complete_callback(void *cb_arg, callback_t cb); int set_safe_callback(void *cb_arg, callback_t cb) __attribute__ ((deprecated)); + /// Request immediate cancellation as if by IoCtx::aio_cancel(). + int cancel(); int wait_for_complete(); int wait_for_safe() __attribute__ ((deprecated)); int wait_for_complete_and_cb(); @@ -772,17 +774,30 @@ inline namespace v14_2_0 { void tier_evict(); }; - /* IoCtx : This is a context in which we can perform I/O. - * It includes a Pool, + /** + * @brief A handle to a RADOS pool used to perform I/O operations. * * Typical use (error checking omitted): - * + * @code * IoCtx p; * rados.ioctx_create("my_pool", p); - * p->stat(&stats); - * ... etc ... + * p.stat("my_object", &size, &mtime); + * @endcode + * + * IoCtx holds a pointer to its underlying implementation. The dup() + * method performs a deep copy of this implementation, but the copy + * construction and assignment operations perform shallow copies by + * sharing that pointer. + * + * Function names starting with aio_ are asynchronous operations that + * return immediately after submitting a request, and whose completions + * are managed by the given AioCompletion pointer. 
The IoCtx's underlying + * implementation is involved in the delivery of these completions, so + * the caller must guarantee that its lifetime is preserved until then - + * if not by preserving the IoCtx instance that submitted the request, + * then by a copied/moved instance that shares the same implementation. * - * NOTE: be sure to call watch_flush() prior to destroying any IoCtx + * @note Be sure to call watch_flush() prior to destroying any IoCtx * that is used for watch events to ensure that racing callbacks * have completed. */ @@ -791,9 +806,13 @@ inline namespace v14_2_0 { public: IoCtx(); static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool); + /// Construct a shallow copy of rhs, sharing its underlying implementation. IoCtx(const IoCtx& rhs); + /// Assign a shallow copy of rhs, sharing its underlying implementation. IoCtx& operator=(const IoCtx& rhs); + /// Move construct from rhs, transferring its underlying implementation. IoCtx(IoCtx&& rhs) noexcept; + /// Move assign from rhs, transferring its underlying implementation. IoCtx& operator=(IoCtx&& rhs) noexcept; ~IoCtx(); @@ -1150,7 +1169,8 @@ inline namespace v14_2_0 { int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts); /** - * Cancel aio operation + * Request immediate cancellation with error code -ECANCELED + * if the operation hasn't already completed. 
* * @param c completion handle * @returns 0 on success, negative error code on failure diff --git a/src/include/random.h b/src/include/random.h index f2e3e37bcd7..6b7c9405efd 100644 --- a/src/include/random.h +++ b/src/include/random.h @@ -16,9 +16,9 @@ #define CEPH_RANDOM_H 1 #include <mutex> +#include <optional> #include <random> #include <type_traits> -#include <boost/optional.hpp> // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85494 #ifdef __MINGW32__ @@ -123,7 +123,7 @@ void randomize_rng() template <typename EngineT> EngineT& engine() { - thread_local boost::optional<EngineT> rng_engine; + thread_local std::optional<EngineT> rng_engine; if (!rng_engine) { rng_engine.emplace(EngineT()); diff --git a/src/kv/KeyValueDB.h b/src/kv/KeyValueDB.h index 858742d511e..d926840180e 100644 --- a/src/kv/KeyValueDB.h +++ b/src/kv/KeyValueDB.h @@ -9,6 +9,7 @@ #include <map> #include <optional> #include <string> +#include <string_view> #include <boost/scoped_ptr.hpp> #include "include/encoding.h" #include "common/Formatter.h" @@ -211,6 +212,10 @@ public: return ""; } virtual ceph::buffer::list value() = 0; + // When valid() returns true, value returned as string-view + // is guaranteed to be valid until iterator is moved to another + // position; that is until call to next() / seek_to_first() / etc. + virtual std::string_view value_as_sv() = 0; virtual int status() = 0; virtual ~SimplestIteratorImpl() {} }; @@ -220,7 +225,12 @@ public: virtual ~IteratorImpl() {} virtual int seek_to_last() = 0; virtual int prev() = 0; + // When valid() returns true, key returned as string-view + // is guaranteed to be valid until iterator is moved to another + // position; that is until call to next() / seek_to_first() / etc. 
+ virtual std::string_view key_as_sv() = 0; virtual std::pair<std::string, std::string> raw_key() = 0; + virtual std::pair<std::string_view, std::string_view> raw_key_as_sv() = 0; virtual ceph::buffer::ptr value_as_ptr() { ceph::buffer::list bl = value(); if (bl.length() == 1) { @@ -247,7 +257,9 @@ public: virtual int next() = 0; virtual int prev() = 0; virtual std::string key() = 0; + virtual std::string_view key_as_sv() = 0; virtual std::pair<std::string,std::string> raw_key() = 0; + virtual std::pair<std::string_view, std::string_view> raw_key_as_sv() = 0; virtual bool raw_key_is_prefixed(const std::string &prefix) = 0; virtual ceph::buffer::list value() = 0; virtual ceph::buffer::ptr value_as_ptr() { @@ -258,6 +270,7 @@ public: return ceph::buffer::ptr(); } } + virtual std::string_view value_as_sv() = 0; virtual int status() = 0; virtual size_t key_size() { return 0; @@ -315,15 +328,24 @@ private: std::string key() override { return generic_iter->key(); } + std::string_view key_as_sv() override { + return generic_iter->key_as_sv(); + } std::pair<std::string, std::string> raw_key() override { return generic_iter->raw_key(); } + std::pair<std::string_view, std::string_view> raw_key_as_sv() override { + return generic_iter->raw_key_as_sv(); + } ceph::buffer::list value() override { return generic_iter->value(); } ceph::buffer::ptr value_as_ptr() override { return generic_iter->value_as_ptr(); } + std::string_view value_as_sv() override { + return generic_iter->value_as_sv(); + } int status() override { return generic_iter->status(); } diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index ca63ea06484..51d224b67c0 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -6,6 +6,7 @@ #include <memory> #include <set> #include <string> +#include <string_view> #include <errno.h> #include <unistd.h> #include <sys/types.h> @@ -47,6 +48,7 @@ using std::ostream; using std::pair; using std::set; using std::string; +using std::string_view; using 
std::unique_ptr; using std::vector; @@ -1992,7 +1994,7 @@ int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key) // Find separator inside Slice char* separator = (char*) memchr(in.data(), 0, in.size()); - if (separator == NULL) + if (separator == nullptr) return -EINVAL; prefix_len = size_t(separator - in.data()); if (prefix_len >= in.size()) @@ -2006,6 +2008,27 @@ int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key) return 0; } +// TODO: deduplicate the code, preferrably by removing the string variant +int RocksDBStore::split_key(rocksdb::Slice in, string_view *prefix, string_view *key) +{ + size_t prefix_len = 0; + + // Find separator inside Slice + char* separator = (char*) memchr(in.data(), 0, in.size()); + if (separator == nullptr) + return -EINVAL; + prefix_len = size_t(separator - in.data()); + if (prefix_len >= in.size()) + return -EINVAL; + + // Fetch prefix and/or key directly from Slice + if (prefix) + *prefix = string_view(in.data(), prefix_len); + if (key) + *key = string_view(separator + 1, in.size() - prefix_len - 1); + return 0; +} + void RocksDBStore::compact() { dout(2) << __func__ << " starting" << dendl; @@ -2226,7 +2249,13 @@ int RocksDBStore::RocksDBWholeSpaceIteratorImpl::prev() string RocksDBStore::RocksDBWholeSpaceIteratorImpl::key() { string out_key; - split_key(dbiter->key(), 0, &out_key); + split_key(dbiter->key(), nullptr, &out_key); + return out_key; +} +string_view RocksDBStore::RocksDBWholeSpaceIteratorImpl::key_as_sv() +{ + string_view out_key; + split_key(dbiter->key(), nullptr, &out_key); return out_key; } pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key() @@ -2235,6 +2264,12 @@ pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key() split_key(dbiter->key(), &prefix, &key); return make_pair(prefix, key); } +pair<string_view,string_view> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_as_sv() +{ + string_view prefix, key; + 
split_key(dbiter->key(), &prefix, &key); + return make_pair(prefix, key); +} bool RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_is_prefixed(const string &prefix) { // Look for "prefix\0" right in rocksb::Slice @@ -2267,6 +2302,12 @@ bufferptr RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_as_ptr() return bufferptr(val.data(), val.size()); } +std::string_view RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_as_sv() +{ + rocksdb::Slice val = dbiter->value(); + return std::string_view{val.data(), val.size()}; +} + int RocksDBStore::RocksDBWholeSpaceIteratorImpl::status() { return dbiter->status().ok() ? 0 : -1; @@ -2348,9 +2389,15 @@ public: string key() override { return dbiter->key().ToString(); } + string_view key_as_sv() override { + return dbiter->key().ToStringView(); + } std::pair<std::string, std::string> raw_key() override { return make_pair(prefix, key()); } + std::pair<std::string_view, std::string_view> raw_key_as_sv() override { + return make_pair(prefix, dbiter->key().ToStringView()); + } bufferlist value() override { return to_bufferlist(dbiter->value()); } @@ -2358,6 +2405,10 @@ public: rocksdb::Slice val = dbiter->value(); return bufferptr(val.data(), val.size()); } + std::string_view value_as_sv() override { + rocksdb::Slice val = dbiter->value(); + return std::string_view{val.data(), val.size()}; + } int status() override { return dbiter->status().ok() ? 
0 : -1; } @@ -2668,6 +2719,15 @@ public: } } + std::string_view key_as_sv() override + { + if (smaller == on_main) { + return main->key_as_sv(); + } else { + return current_shard->second->key_as_sv(); + } + } + std::pair<std::string,std::string> raw_key() override { if (smaller == on_main) { @@ -2677,6 +2737,15 @@ public: } } + std::pair<std::string_view,std::string_view> raw_key_as_sv() override + { + if (smaller == on_main) { + return main->raw_key_as_sv(); + } else { + return { current_shard->first, current_shard->second->key_as_sv() }; + } + } + bool raw_key_is_prefixed(const std::string &prefix) override { if (smaller == on_main) { @@ -2695,6 +2764,15 @@ public: } } + std::string_view value_as_sv() override + { + if (smaller == on_main) { + return main->value_as_sv(); + } else { + return current_shard->second->value_as_sv(); + } + } + int status() override { //because we already had to inspect key, it must be ok @@ -3017,9 +3095,15 @@ public: string key() override { return iters[0]->key().ToString(); } + string_view key_as_sv() override { + return iters[0]->key().ToStringView(); + } std::pair<std::string, std::string> raw_key() override { return make_pair(prefix, key()); } + std::pair<std::string_view, std::string_view> raw_key_as_sv() override { + return make_pair(prefix, iters[0]->key().ToStringView()); + } bufferlist value() override { return to_bufferlist(iters[0]->value()); } @@ -3027,6 +3111,10 @@ public: rocksdb::Slice val = iters[0]->value(); return bufferptr(val.data(), val.size()); } + std::string_view value_as_sv() override { + rocksdb::Slice val = iters[0]->value(); + return std::string_view{val.data(), val.size()}; + } int status() override { return iters[0]->status().ok() ? 
0 : -1; } diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index 477b209854c..50b91be2bf6 100644 --- a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -386,10 +386,13 @@ public: int next() override; int prev() override; std::string key() override; + std::string_view key_as_sv() override; std::pair<std::string,std::string> raw_key() override; + std::pair<std::string_view,std::string_view> raw_key_as_sv() override; bool raw_key_is_prefixed(const std::string &prefix) override; ceph::bufferlist value() override; ceph::bufferptr value_as_ptr() override; + std::string_view value_as_sv() override; int status() override; size_t key_size() override; size_t value_size() override; @@ -419,6 +422,7 @@ public: } static int split_key(rocksdb::Slice in, std::string *prefix, std::string *key); + static int split_key(rocksdb::Slice in, std::string_view *prefix, std::string_view *key); static std::string past_prefix(const std::string &prefix); diff --git a/src/librados/librados_asio.h b/src/librados/librados_asio.h index 0aedc376575..3e5b7c57c6f 100644 --- a/src/librados/librados_asio.h +++ b/src/librados/librados_asio.h @@ -14,6 +14,9 @@ #ifndef LIBRADOS_ASIO_H #define LIBRADOS_ASIO_H +#include <boost/asio/associated_cancellation_slot.hpp> +#include <boost/asio/cancellation_type.hpp> + #include "include/rados/librados.hpp" #include "common/async/completion.h" #include "librados/AioCompletionImpl.h" @@ -74,6 +77,7 @@ struct Invoker<void> { template <typename Result> struct AsyncOp : Invoker<Result> { unique_aio_completion_ptr aio_completion; + boost::asio::cancellation_slot slot; using Signature = typename Invoker<Result>::Signature; using Completion = ceph::async::Completion<Signature, AsyncOp<Result>>; @@ -83,6 +87,7 @@ struct AsyncOp : Invoker<Result> { auto p = std::unique_ptr<Completion>{static_cast<Completion*>(arg)}; // move result out of Completion memory being freed auto op = std::move(p->user_data); + op.slot.clear(); // clear our cancellation handler // 
access AioCompletionImpl directly to avoid locking const librados::AioCompletionImpl* pc = op.aio_completion->pc; const int ret = pc->rval; @@ -94,11 +99,46 @@ struct AsyncOp : Invoker<Result> { op.dispatch(std::move(p), ec, ver); } + struct op_cancellation { + AioCompletion* completion = nullptr; + bool is_read = false; + + void operator()(boost::asio::cancellation_type type) { + if (completion == nullptr) { + return; // no AioCompletion attached + } else if (type == boost::asio::cancellation_type::none) { + return; // no cancellation requested + } else if (is_read) { + // read operations produce no side effects, so can satisfy the + // requirements of 'total' cancellation. the weaker requirements + // of 'partial' and 'terminal' are also satisfied + completion->cancel(); + } else if (type == boost::asio::cancellation_type::terminal) { + // write operations only support 'terminal' cancellation because we + // can't guarantee that no osd has succeeded (or will succeed) in + // applying the write + completion->cancel(); + } + } + }; + template <typename Executor1, typename CompletionHandler> - static auto create(const Executor1& ex1, CompletionHandler&& handler) { + static auto create(const Executor1& ex1, bool is_read, + CompletionHandler&& handler) { + op_cancellation* cancel_handler = nullptr; + auto slot = boost::asio::get_associated_cancellation_slot(handler); + if (slot.is_connected()) { + cancel_handler = &slot.template emplace<op_cancellation>(); + } + auto p = Completion::create(ex1, std::move(handler)); p->user_data.aio_completion.reset( Rados::aio_create_completion(p.get(), aio_dispatch)); + if (cancel_handler) { + cancel_handler->completion = p->user_data.aio_completion.get(); + cancel_handler->is_read = is_read; + p->user_data.slot = std::move(slot); + } return p; } }; @@ -108,6 +148,9 @@ struct AsyncOp : Invoker<Result> { /// Calls IoCtx::aio_read() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t, 
bufferlist). +/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. template <typename ExecutionContext, typename CompletionToken> auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid, size_t len, uint64_t off, CompletionToken&& token) @@ -117,7 +160,8 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid, return boost::asio::async_initiate<CompletionToken, Signature>( [] (auto handler, auto ex, IoCtx& io, const std::string& oid, size_t len, uint64_t off) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = true; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_read(oid, op.aio_completion.get(), &op.result, len, off); @@ -132,6 +176,9 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid, /// Calls IoCtx::aio_write() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t). +/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. 
template <typename ExecutionContext, typename CompletionToken> auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid, const bufferlist &bl, size_t len, uint64_t off, @@ -142,7 +189,8 @@ auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid, return boost::asio::async_initiate<CompletionToken, Signature>( [] (auto handler, auto ex, IoCtx& io, const std::string& oid, const bufferlist &bl, size_t len, uint64_t off) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = false; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_write(oid, op.aio_completion.get(), bl, len, off); @@ -157,6 +205,9 @@ auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid, /// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t, bufferlist). +/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. template <typename ExecutionContext, typename CompletionToken> auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, ObjectReadOperation *read_op, int flags, @@ -167,7 +218,8 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, return boost::asio::async_initiate<CompletionToken, Signature>( [] (auto handler, auto ex, IoCtx& io, const std::string& oid, ObjectReadOperation *read_op, int flags) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = true; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_operate(oid, op.aio_completion.get(), read_op, @@ -183,6 +235,9 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, /// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t). 
+/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. template <typename ExecutionContext, typename CompletionToken> auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, ObjectWriteOperation *write_op, int flags, @@ -194,7 +249,8 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, [] (auto handler, auto ex, IoCtx& io, const std::string& oid, ObjectWriteOperation *write_op, int flags, const jspan_context* trace_ctx) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = false; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_operate(oid, op.aio_completion.get(), write_op, flags, trace_ctx); @@ -209,6 +265,9 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, /// Calls IoCtx::aio_notify() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t, bufferlist). +/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. 
template <typename ExecutionContext, typename CompletionToken> auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid, bufferlist& bl, uint64_t timeout_ms, CompletionToken &&token) @@ -218,7 +277,8 @@ auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid, return boost::asio::async_initiate<CompletionToken, Signature>( [] (auto handler, auto ex, IoCtx& io, const std::string& oid, bufferlist& bl, uint64_t timeout_ms) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = false; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_notify(oid, op.aio_completion.get(), diff --git a/src/librados/librados_cxx.cc b/src/librados/librados_cxx.cc index 2167eeade3c..60217b99b41 100644 --- a/src/librados/librados_cxx.cc +++ b/src/librados/librados_cxx.cc @@ -1103,6 +1103,14 @@ void librados::AioCompletion::release() delete this; } +int librados::AioCompletion::cancel() +{ + if (!pc->io) { + return 0; // no operation was started + } + return pc->io->aio_cancel(pc); +} + ///////////////////////////// IoCtx ////////////////////////////// librados::IoCtx::IoCtx() : io_ctx_impl(NULL) { diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc index 65e3fc4a4c2..160bb4dcf9e 100644 --- a/src/librbd/ObjectMap.cc +++ b/src/librbd/ObjectMap.cc @@ -107,32 +107,6 @@ bool ObjectMap<I>::object_may_exist(uint64_t object_no) const } template <typename I> -bool ObjectMap<I>::object_may_not_exist(uint64_t object_no) const -{ - ceph_assert(ceph_mutex_is_locked(m_image_ctx.image_lock)); - - // Fall back to default logic if object map is disabled or invalid - if (!m_image_ctx.test_features(RBD_FEATURE_OBJECT_MAP, - m_image_ctx.image_lock)) { - return true; - } - - bool flags_set; - int r = m_image_ctx.test_flags(m_image_ctx.snap_id, - RBD_FLAG_OBJECT_MAP_INVALID, - m_image_ctx.image_lock, &flags_set); - if (r < 0 || flags_set) { - return true; - } - - uint8_t state = (*this)[object_no]; - 
bool nonexistent = (state != OBJECT_EXISTS && state != OBJECT_EXISTS_CLEAN); - ldout(m_image_ctx.cct, 20) << "object_no=" << object_no << " r=" - << nonexistent << dendl; - return nonexistent; -} - -template <typename I> bool ObjectMap<I>::update_required(const ceph::BitVector<2>::Iterator& it, uint8_t new_state) { ceph_assert(ceph_mutex_is_locked(m_lock)); diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h index 35ea4cb88f9..5e7fcbbe9dd 100644 --- a/src/librbd/ObjectMap.h +++ b/src/librbd/ObjectMap.h @@ -65,7 +65,6 @@ public: void close(Context *on_finish); bool set_object_map(ceph::BitVector<2> &target_object_map); bool object_may_exist(uint64_t object_no) const; - bool object_may_not_exist(uint64_t object_no) const; void aio_save(Context *on_finish); void aio_resize(uint64_t new_size, uint8_t default_object_state, diff --git a/src/librbd/migration/HttpClient.cc b/src/librbd/migration/HttpClient.cc index 09fe91da02a..d212981a917 100644 --- a/src/librbd/migration/HttpClient.cc +++ b/src/librbd/migration/HttpClient.cc @@ -193,7 +193,7 @@ protected: ldout(cct, 15) << dendl; boost::system::error_code ec; - boost::beast::get_lowest_layer(derived().stream()).socket().close(ec); + derived().stream().lowest_layer().close(ec); } private: @@ -357,8 +357,7 @@ private: } int shutdown_socket() { - if (!boost::beast::get_lowest_layer( - derived().stream()).socket().is_open()) { + if (!derived().stream().lowest_layer().is_open()) { return 0; } @@ -366,7 +365,7 @@ private: ldout(cct, 15) << dendl; boost::system::error_code ec; - boost::beast::get_lowest_layer(derived().stream()).socket().shutdown( + derived().stream().lowest_layer().shutdown( boost::asio::ip::tcp::socket::shutdown_both, ec); if (ec && ec != boost::beast::errc::not_connected) { @@ -595,7 +594,7 @@ public: this->close_socket(); } - inline boost::beast::tcp_stream& + inline boost::asio::ip::tcp::socket& stream() { return m_stream; } @@ -607,12 +606,13 @@ protected: auto cct = http_client->m_cct; 
ldout(cct, 15) << dendl; - ceph_assert(!m_stream.socket().is_open()); - m_stream.async_connect( - results, - [on_finish](boost::system::error_code ec, const auto& endpoint) { - on_finish->complete(-ec.value()); - }); + ceph_assert(!m_stream.is_open()); + boost::asio::async_connect(m_stream, + results, + [on_finish](boost::system::error_code ec, + const auto& endpoint) { + on_finish->complete(-ec.value()); + }); } void disconnect(Context* on_finish) override { @@ -624,7 +624,7 @@ protected: } private: - boost::beast::tcp_stream m_stream; + boost::asio::ip::tcp::socket m_stream; }; #undef dout_prefix @@ -643,7 +643,7 @@ public: this->close_socket(); } - inline boost::beast::ssl_stream<boost::beast::tcp_stream>& + inline boost::asio::ssl::stream<boost::asio::ip::tcp::socket>& stream() { return m_stream; } @@ -655,8 +655,9 @@ protected: auto cct = http_client->m_cct; ldout(cct, 15) << dendl; - ceph_assert(!boost::beast::get_lowest_layer(m_stream).socket().is_open()); - boost::beast::get_lowest_layer(m_stream).async_connect( + ceph_assert(!m_stream.lowest_layer().is_open()); + async_connect( + m_stream.lowest_layer(), results, [this, on_finish](boost::system::error_code ec, const auto& endpoint) { handle_connect(-ec.value(), on_finish); @@ -681,12 +682,12 @@ protected: // ssl_stream object can't be reused after shut down -- move-in // a freshly constructed instance - m_stream = boost::beast::ssl_stream<boost::beast::tcp_stream>( + m_stream = boost::asio::ssl::stream<boost::asio::ip::tcp::socket>( http_client->m_strand, http_client->m_ssl_context); } private: - boost::beast::ssl_stream<boost::beast::tcp_stream> m_stream; + boost::asio::ssl::stream<boost::asio::ip::tcp::socket> m_stream; void handle_connect(int r, Context* on_finish) { auto http_client = this->m_http_client; diff --git a/src/librbd/migration/HttpClient.h b/src/librbd/migration/HttpClient.h index 3997e6159e7..5844f918693 100644 --- a/src/librbd/migration/HttpClient.h +++ b/src/librbd/migration/HttpClient.h 
@@ -13,13 +13,12 @@ #include <boost/asio/strand.hpp> #include <boost/asio/ip/tcp.hpp> #include <boost/asio/ssl/context.hpp> +#include <boost/asio/ssl/stream.hpp> #include <boost/beast/version.hpp> -#include <boost/beast/core/tcp_stream.hpp> #include <boost/beast/http/empty_body.hpp> #include <boost/beast/http/message.hpp> #include <boost/beast/http/string_body.hpp> #include <boost/beast/http/write.hpp> -#include <boost/beast/ssl/ssl_stream.hpp> #include <functional> #include <memory> #include <string> @@ -97,7 +96,7 @@ public: completion(r, std::move(response)); } - void operator()(boost::beast::tcp_stream& stream) override { + void operator()(boost::asio::ip::tcp::socket& stream) override { preprocess_request(); boost::beast::http::async_write( @@ -110,7 +109,7 @@ public: } void operator()( - boost::beast::ssl_stream<boost::beast::tcp_stream>& stream) override { + boost::asio::ssl::stream<boost::asio::ip::tcp::socket>& stream) override { preprocess_request(); boost::beast::http::async_write( @@ -152,9 +151,9 @@ private: virtual bool need_eof() const = 0; virtual bool header_only() const = 0; virtual void complete(int r, Response&&) = 0; - virtual void operator()(boost::beast::tcp_stream& stream) = 0; + virtual void operator()(boost::asio::ip::tcp::socket& stream) = 0; virtual void operator()( - boost::beast::ssl_stream<boost::beast::tcp_stream>& stream) = 0; + boost::asio::ssl::stream<boost::asio::ip::tcp::socket>& stream) = 0; }; template <typename D> struct HttpSession; diff --git a/src/librbd/operation/FlattenRequest.cc b/src/librbd/operation/FlattenRequest.cc index 7bc34681924..8034637e8e6 100644 --- a/src/librbd/operation/FlattenRequest.cc +++ b/src/librbd/operation/FlattenRequest.cc @@ -49,15 +49,6 @@ public: return -ERESTART; } - { - std::shared_lock image_lock{image_ctx.image_lock}; - if (image_ctx.object_map != nullptr && - !image_ctx.object_map->object_may_not_exist(m_object_no)) { - // can skip because the object already exists - return 1; - } - } - if 
(!io::util::trigger_copyup( &image_ctx, m_object_no, m_io_context, this)) { // stop early if the parent went away - it just means diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 6fbfc79d416..1c1eeb4ecf8 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -26,6 +26,7 @@ #include "mds/MDSRank.h" #include "mds/MDSMap.h" #include "mds/Locker.h" +#include "mds/mdstypes.h" #include "Beacon.h" @@ -550,6 +551,19 @@ void Beacon::notify_health(MDSRank const *mds) } } } + if (mds->is_replay()) { + CachedStackStringStream css; + auto estimate = mds->mdlog->get_estimated_replay_finish_time(); + // this probably should be configurable, however, its fine to report + // if replay is running for more than 30 seconds. + if (estimate.elapsed_time > std::chrono::seconds(30)) { + *css << "replay: " << estimate.percent_complete << "% complete - elapsed time: " + << estimate.elapsed_time << ", estimated time remaining: " + << estimate.estimated_time; + MDSHealthMetric m(MDS_HEALTH_ESTIMATED_REPLAY_TIME, HEALTH_WARN, css->strv()); + health.metrics.push_back(m); + } + } } MDSMap::DaemonState Beacon::get_want_state() const diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 4bbf2a1a141..d041e3b2fc8 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -221,7 +221,46 @@ uint64_t MDLog::get_safe_pos() const return journaler->get_write_safe_pos(); } +// estimate the replay completion time based on mdlog journal pointers +EstimatedReplayTime MDLog::get_estimated_replay_finish_time() { + ceph_assert(mds->is_replay()); + EstimatedReplayTime estimated_time{0, std::chrono::seconds::zero(), std::chrono::seconds::zero()}; + if (!journaler) { + return estimated_time; + } + + auto read_pos = journaler->get_read_pos(); + auto write_pos = journaler->get_write_pos(); + auto trimmed_pos = journaler->get_trimmed_pos(); + + dout(20) << __func__ << ": read_pos=" << read_pos << ", write_pos=" + << write_pos << ", trimmed_pos=" << trimmed_pos << dendl; + + if (read_pos == trimmed_pos || 
write_pos == trimmed_pos) { + return estimated_time; + } + + auto total_bytes = write_pos - trimmed_pos; + double percent_complete = ((double)(read_pos - trimmed_pos)) / (double)total_bytes; + auto elapsed_time = std::chrono::duration_cast<std::chrono::seconds> + (ceph::coarse_mono_clock::now() - replay_start_time); + auto time = ((1 - percent_complete) / percent_complete) * elapsed_time; + + dout(20) << __func__ << "percent_complete=" << percent_complete + << ", elapsed_time=" << elapsed_time + << ", estimated_time=" << std::chrono::round<std::chrono::seconds>(time) + << dendl; + + estimated_time.percent_complete = percent_complete * 100; + estimated_time.elapsed_time = elapsed_time; + estimated_time.estimated_time = std::chrono::round<std::chrono::seconds>(time); + dout(20) << __func__ << "estimated_time.percent_complete=" << estimated_time.percent_complete + << ", estimated_time.elapsed_time=" << estimated_time.elapsed_time + << ", estimated_time.estimated_time=" << estimated_time.estimated_time + << dendl; + return estimated_time; +} void MDLog::create(MDSContext *c) { @@ -1137,6 +1176,7 @@ void MDLog::_recovery_thread(MDSContext *completion) { std::lock_guard l(mds->mds_lock); journaler = front_journal; + replay_start_time = ceph::coarse_mono_clock::now(); } C_SaferCond recover_wait; @@ -1374,11 +1414,17 @@ void MDLog::_reformat_journal(JournalPointer const &jp_in, Journaler *old_journa // i am a separate thread void MDLog::_replay_thread() { - dout(10) << "_replay_thread start" << dendl; + dout(10) << __func__ << ": start time: " << replay_start_time << ", now: " + << ceph::coarse_mono_clock::now() << dendl; // loop int r = 0; while (1) { + auto sleep_time = g_conf().get_val<std::chrono::milliseconds>("mds_delay_journal_replay_for_testing"); + if (unlikely(sleep_time > 0ms)) { + dout(10) << __func__ << ": sleeping for " << sleep_time << "ms" << dendl; + std::this_thread::sleep_for(sleep_time); + } // wait for read? 
journaler->check_isreadable(); if (journaler->get_error()) { diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h index a858b40fa03..180a34c9d82 100644 --- a/src/mds/MDLog.h +++ b/src/mds/MDLog.h @@ -53,6 +53,7 @@ enum { #include "LogSegment.h" #include "MDSMap.h" #include "SegmentBoundary.h" +#include "mdstypes.h" #include <list> #include <map> @@ -162,6 +163,7 @@ public: void reopen(MDSContext *onopen); void append(); void replay(MDSContext *onfinish); + EstimatedReplayTime get_estimated_replay_finish_time(); void standby_trim_segments(); @@ -328,5 +330,7 @@ private: std::atomic<bool> upkeep_log_trim_shutdown{false}; std::map<uint64_t, std::vector<Context*>> waiting_for_expire; // protected by mds_lock + + ceph::coarse_mono_time replay_start_time = ceph::coarse_mono_clock::zero(); }; #endif diff --git a/src/mds/MetricsHandler.cc b/src/mds/MetricsHandler.cc index 9fc4c6122a4..d9c09e06b27 100644 --- a/src/mds/MetricsHandler.cc +++ b/src/mds/MetricsHandler.cc @@ -20,15 +20,6 @@ MetricsHandler::MetricsHandler(CephContext *cct, MDSRank *mds) mds(mds) { } -bool MetricsHandler::ms_can_fast_dispatch2(const cref_t<Message> &m) const { - return m->get_type() == CEPH_MSG_CLIENT_METRICS || m->get_type() == MSG_MDS_PING; -} - -void MetricsHandler::ms_fast_dispatch2(const ref_t<Message> &m) { - bool handled = ms_dispatch2(m); - ceph_assert(handled); -} - bool MetricsHandler::ms_dispatch2(const ref_t<Message> &m) { if (m->get_type() == CEPH_MSG_CLIENT_METRICS && m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_CLIENT) { diff --git a/src/mds/MetricsHandler.h b/src/mds/MetricsHandler.h index 0b75b024860..25ee208aa95 100644 --- a/src/mds/MetricsHandler.h +++ b/src/mds/MetricsHandler.h @@ -25,11 +25,6 @@ class MetricsHandler : public Dispatcher { public: MetricsHandler(CephContext *cct, MDSRank *mds); - bool ms_can_fast_dispatch_any() const override { - return true; - } - bool ms_can_fast_dispatch2(const cref_t<Message> &m) const override; - void ms_fast_dispatch2(const 
ref_t<Message> &m) override; bool ms_dispatch2(const ref_t<Message> &m) override; void ms_handle_connect(Connection *c) override { diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 5874a3dce56..e66b5aa08c7 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4167,7 +4167,7 @@ void Server::handle_client_getattr(const MDRequestRef& mdr, bool is_lookup) if (r < 0) { // fall-thru. let rdlock_path_pin_ref() check again. - } else if (is_lookup) { + } else if (is_lookup && mdr->dn[0].size()) { CDentry* dn = mdr->dn[0].back(); mdr->pin(dn); auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple()); @@ -4274,7 +4274,7 @@ void Server::handle_client_getattr(const MDRequestRef& mdr, bool is_lookup) // reply dout(10) << "reply to stat on " << *req << dendl; mdr->tracei = ref; - if (is_lookup) + if (is_lookup && mdr->dn[0].size()) mdr->tracedn = mdr->dn[0].back(); respond_to_request(mdr, 0); } diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc index 680218e62e3..f9424eed6dc 100644 --- a/src/mds/mdstypes.cc +++ b/src/mds/mdstypes.cc @@ -1042,3 +1042,8 @@ void snaprealm_reconnect_t::generate_test_instances(std::list<snaprealm_reconnec ls.back()->realm.seq = 2; ls.back()->realm.parent = 1; } + +void EstimatedReplayTime::print(std::ostream& out) { + out << "replay: " << percent_complete << "% complete - elapsed time: " + << elapsed_time << ", estimated time remaining: " << estimated_time; +} diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 3b8269006cb..742d7b23432 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -1044,4 +1044,12 @@ inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) } WRITE_CLASS_ENCODER(MDSCacheObjectInfo) +struct EstimatedReplayTime { + double percent_complete; + std::chrono::seconds estimated_time; + std::chrono::seconds elapsed_time; + + void print(std::ostream& out); +}; + #endif diff --git a/src/messages/MMDSBeacon.h 
b/src/messages/MMDSBeacon.h index c157c33e758..526285aae8c 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -48,6 +48,7 @@ enum mds_metric_t { MDS_HEALTH_CLIENTS_LAGGY, MDS_HEALTH_CLIENTS_LAGGY_MANY, MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH, + MDS_HEALTH_ESTIMATED_REPLAY_TIME, MDS_HEALTH_DUMMY, // not a real health warning, for testing }; @@ -69,6 +70,7 @@ inline const char *mds_metric_name(mds_metric_t m) case MDS_HEALTH_CLIENTS_LAGGY: return "MDS_CLIENTS_LAGGY"; case MDS_HEALTH_CLIENTS_LAGGY_MANY: return "MDS_CLIENTS_LAGGY_MANY"; case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "MDS_CLIENTS_BROKEN_ROOTSQUASH"; + case MDS_HEALTH_ESTIMATED_REPLAY_TIME: return "MDS_ESTIMATED_REPLAY_TIME"; case MDS_HEALTH_DUMMY: return "MDS_DUMMY"; default: return "???"; @@ -107,6 +109,8 @@ inline const char *mds_metric_summary(mds_metric_t m) return "%num% client(s) laggy due to laggy OSDs"; case MDS_HEALTH_CLIENTS_BROKEN_ROOTSQUASH: return "%num% MDS report clients with broken root_squash implementation"; + case MDS_HEALTH_ESTIMATED_REPLAY_TIME: + return "%num% estimated journal replay time"; default: return "???"; } diff --git a/src/mgr/PyModule.cc b/src/mgr/PyModule.cc index cff63ef4a6b..4f996489ba0 100644 --- a/src/mgr/PyModule.cc +++ b/src/mgr/PyModule.cc @@ -38,6 +38,18 @@ std::string PyModule::mgr_store_prefix = "mgr/"; #define BOOST_BIND_GLOBAL_PLACEHOLDERS // Boost apparently can't be bothered to fix its own usage of its own // deprecated features. 
+ +// Fix instances of "'BOOST_PP_ITERATION_02' was not declared in this scope; did +// you mean 'BOOST_PP_ITERATION_05'" and related macro error bullshit that spans +// 300 lines of errors +// +// Apparently you can't include boost/python stuff _and_ have this header +// defined +// +// Thanks to the ceph-aur folks for the fix at: +// https://github.com/bazaah/aur-ceph/commit/8c5cc7d8deec002f7596b6d0860859a0a718f12b +#undef BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS + #include <boost/python/extract.hpp> #include <boost/python/import.hpp> #include <boost/python/object.hpp> diff --git a/src/mgr/PyModule.h b/src/mgr/PyModule.h index 177447c2cb3..a47db3a47ef 100644 --- a/src/mgr/PyModule.h +++ b/src/mgr/PyModule.h @@ -161,9 +161,9 @@ public: } const std::string &get_name() const { - std::lock_guard l(lock) ; return module_name; + return module_name; } - const std::string &get_error_string() const { + std::string get_error_string() const { std::lock_guard l(lock) ; return error_string; } bool get_can_run() const { diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index 6220a357ff0..cc53d2869f7 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -1211,6 +1211,11 @@ class RemoveFilesystemHandler : public FileSystemCommandHandler fsmap.erase_filesystem(fsp->get_fscid()); + ss << "If there are active snapshot schedules associated with this " + << "file-system, you might see EIO errors in the mgr logs or at the " + << "snap-schedule command-line due to the missing file-system. 
" + << "However, these errors are transient and will get auto-resolved."; + return 0; } }; diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 7332ec3edb1..833bdddc71b 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -4024,7 +4024,7 @@ void Monitor::handle_command(MonOpRequestRef op) for (auto& p : mgrstatmon()->get_service_map().services) { auto &service = p.first; - if (ServiceMap::is_normal_ceph_entity(service)) { + if (ServiceMap::is_normal_ceph_entity(service) || service == "nvmeof") { continue; } f->open_object_section(service.c_str()); diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 719403925ad..2d2735f1e7c 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -171,6 +171,8 @@ int NVMeofGwMap::cfg_delete_gw( << state.availability << " Resulting GW availability: " << state.availability << dendl; state.subsystems.clear();//ignore subsystems of this GW + utime_t now = ceph_clock_now(); + mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now; return 0; } } @@ -895,10 +897,12 @@ struct CMonRequestProposal : public Context { } }; -void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const +void NVMeofGwMap::get_health_checks(health_check_map_t *checks) { list<string> singleGatewayDetail; list<string> gatewayDownDetail; + list<string> gatewayInDeletingDetail; + int deleting_gateways = 0; for (const auto& created_map_pair: created_gws) { const auto& group_key = created_map_pair.first; auto& group = group_key.second; @@ -915,9 +919,37 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const ostringstream ss; ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." 
; gatewayDownDetail.push_back(ss.str()); + } else if (gw_created.availability == gw_availability_t::GW_DELETING) { + deleting_gateways++; + utime_t now = ceph_clock_now(); + bool found_deleting_time = false; + auto gws_deleting_time = mon->nvmegwmon()->gws_deleting_time; + auto group_it = gws_deleting_time.find(group_key); + if (group_it != gws_deleting_time.end()) { + auto& gw_map = group_it->second; + auto gw_it = gw_map.find(gw_id); + if (gw_it != gw_map.end()) { + found_deleting_time = true; + utime_t delete_time = gw_it->second; + if ((now - delete_time) > g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_delete_grace").count()) { + ostringstream ss; + ss << "NVMeoF Gateway '" << gw_id << "' is in deleting state."; + gatewayInDeletingDetail.push_back(ss.str()); + } + } + } + if (!found_deleting_time) { + // DELETING gateway not found in gws_deleting_time, set timeout now + mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now; + } } } } + if (deleting_gateways == 0) { + // no gateway in GW_DELETING state currently, flush old gws_deleting_time + mon->nvmegwmon()->gws_deleting_time.clear(); + } + if (!singleGatewayDetail.empty()) { ostringstream ss; ss << singleGatewayDetail.size() << " group(s) have only 1 nvmeof gateway" @@ -934,6 +966,15 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const ss.str(), gatewayDownDetail.size()); d.detail.swap(gatewayDownDetail); } + if (!gatewayInDeletingDetail.empty()) { + ostringstream ss; + ss << gatewayInDeletingDetail.size() << " gateway(s) are in deleting state" + << "; namespaces are automatically balanced across remaining gateways, " + << "this should take a few minutes."; + auto& d = checks->add("NVMEOF_GATEWAY_DELETING", HEALTH_WARN, + ss.str(), gatewayInDeletingDetail.size()); + d.detail.swap(gatewayInDeletingDetail); + } } int NVMeofGwMap::blocklist_gw( diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index 5f657733012..85fd62b3a07 100755 --- a/src/mon/NVMeofGwMap.h +++ 
b/src/mon/NVMeofGwMap.h @@ -144,7 +144,7 @@ public: DECODE_FINISH(bl); } - void get_health_checks(health_check_map_t *checks) const; + void get_health_checks(health_check_map_t *checks); }; #include "NVMeofGwSerialize.h" diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h index 7fae8b766a5..d7f5fd89cde 100644 --- a/src/mon/NVMeofGwMon.h +++ b/src/mon/NVMeofGwMon.h @@ -82,6 +82,8 @@ public: void check_subs(bool type); void check_sub(Subscription *sub); + std::map<NvmeGroupKey, std::map<NvmeGwId, utime_t>> gws_deleting_time; + private: void synchronize_last_beacon(); void process_gw_down(const NvmeGwId &gw_id, diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc index 2e38bd434a8..6b3a8c3f6dc 100644 --- a/src/msg/async/AsyncMessenger.cc +++ b/src/msg/async/AsyncMessenger.cc @@ -207,22 +207,22 @@ void Processor::accept() } else if (r == -EAGAIN) { break; } else if (r == -EMFILE || r == -ENFILE) { - lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd() + lderr(msgr->cct) << __func__ << " open file descriptors limit reached fd = " << listen_socket.fd() << " errno " << r << " " << cpp_strerror(r) << dendl; if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) { - lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl; + lderr(msgr->cct) << "Proccessor accept has encountered too many errors, just do ceph_abort()." << dendl; ceph_abort(); } continue; } else if (r == -ECONNABORTED) { - ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd() + ldout(msgr->cct, 0) << __func__ << " closed because of rst arrival fd = " << listen_socket.fd() << " errno " << r << " " << cpp_strerror(r) << dendl; continue; } else { lderr(msgr->cct) << __func__ << " no incoming connection?" 
<< " errno " << r << " " << cpp_strerror(r) << dendl; if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) { - lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl; + lderr(msgr->cct) << "Proccessor accept has encountered too many errors, just do ceph_abort()." << dendl; ceph_abort(); } continue; diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc index 7ed5321dcda..eb04e3b8e98 100644 --- a/src/msg/async/EventEpoll.cc +++ b/src/msg/async/EventEpoll.cc @@ -17,6 +17,7 @@ #include "common/errno.h" #include <fcntl.h> #include "EventEpoll.h" +#include "Timeout.h" #define dout_subsys ceph_subsys_ms @@ -120,8 +121,7 @@ int EpollDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct ti { int retval, numevents = 0; - retval = epoll_wait(epfd, events, nevent, - tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + retval = epoll_wait(epfd, events, nevent, timeout_to_milliseconds(tvp)); if (retval > 0) { numevents = retval; fired_events.resize(numevents); diff --git a/src/msg/async/EventPoll.cc b/src/msg/async/EventPoll.cc index 4c09dbb4db4..f46528715e3 100644 --- a/src/msg/async/EventPoll.cc +++ b/src/msg/async/EventPoll.cc @@ -15,6 +15,7 @@ #include "common/errno.h" #include "EventPoll.h" +#include "Timeout.h" #include <unistd.h> #define dout_subsys ceph_subsys_ms @@ -161,11 +162,9 @@ int PollDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct timeval *tvp) { int retval, numevents = 0; #ifdef _WIN32 - retval = WSAPoll(pfds, max_pfds, - tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + retval = WSAPoll(pfds, max_pfds, timeout_to_milliseconds(tvp)); #else - retval = poll(pfds, max_pfds, - tvp ? 
(tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + retval = poll(pfds, max_pfds, timeout_to_milliseconds(tvp)); #endif if (retval > 0) { for (int j = 0; j < max_pfds; j++) { diff --git a/src/msg/async/Timeout.h b/src/msg/async/Timeout.h new file mode 100644 index 00000000000..b8df1b40761 --- /dev/null +++ b/src/msg/async/Timeout.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2024 IONOS SE + * + * Author: Max Kellermann <max.kellermann@ionos.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_TIMEOUT_H +#define CEPH_MSG_TIMEOUT_H + +#include "include/intarith.h" // for div_round_up() + +#include <time.h> // for struct timeval + +/** + * Convert the given `struct timeval` to milliseconds. + * + * This is supposed to be used as timeout parameter to system calls + * such as poll() and epoll_wait(). + */ +constexpr int +timeout_to_milliseconds(const struct timeval &tv) noexcept +{ + /* round up to the next millisecond so we don't wake up too early */ + return tv.tv_sec * 1000 + div_round_up(tv.tv_usec, 1000); +} + +/** + * This overload makes the timeout optional; on nullptr, it returns + * -1. + */ +constexpr int +timeout_to_milliseconds(const struct timeval *tv) noexcept +{ + return tv != nullptr ? 
timeout_to_milliseconds(*tv) : -1; +} + +#endif diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc index 7da9a67be62..65627b5f818 100644 --- a/src/os/DBObjectMap.cc +++ b/src/os/DBObjectMap.cc @@ -519,6 +519,11 @@ bufferlist DBObjectMap::DBObjectMapIteratorImpl::value() return cur_iter->value(); } +std::string_view DBObjectMap::DBObjectMapIteratorImpl::value_as_sv() +{ + return cur_iter->value_as_sv(); +} + int DBObjectMap::DBObjectMapIteratorImpl::status() { return r; diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h index 444f21eb815..1e1452010e7 100644 --- a/src/os/DBObjectMap.h +++ b/src/os/DBObjectMap.h @@ -393,6 +393,7 @@ private: int next() override { ceph_abort(); return 0; } std::string key() override { ceph_abort(); return ""; } ceph::buffer::list value() override { ceph_abort(); return ceph::buffer::list(); } + std::string_view value_as_sv() override { ceph_abort(); return std::string_view(); } int status() override { return 0; } }; @@ -431,6 +432,7 @@ private: int next() override; std::string key() override; ceph::buffer::list value() override; + std::string_view value_as_sv() override; int status() override; bool on_parent() { diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 521435b6c31..df3ae920a2f 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -29,6 +29,7 @@ #include <errno.h> #include <sys/stat.h> +#include <functional> #include <map> #include <memory> #include <vector> @@ -735,15 +736,6 @@ public: std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) = 0; -#ifdef WITH_SEASTAR - virtual int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) = 0; -#endif - /// Filters keys into out which are defined on oid virtual int omap_check_keys( 
CollectionHandle &c, ///< [in] Collection containing oid @@ -766,6 +758,48 @@ public: const ghobject_t &oid ///< [in] object ) = 0; + struct omap_iter_seek_t { + std::string seek_position; + enum { + // start with provided key (seek_position), if it exists + LOWER_BOUND, + // skip provided key (seek_position) even if it exists + UPPER_BOUND + } seek_type = LOWER_BOUND; + static omap_iter_seek_t min_lower_bound() { return {}; } + }; + enum class omap_iter_ret_t { + STOP, + NEXT + }; + /** + * Iterate over object map with user-provided callable + * + * Warning! The callable is executed under lock on bluestore + * operations in c. Do not use bluestore methods on c while + * iterating. (Filling in a transaction is no problem). + * + * @param c collection + * @param oid object + * @param start_from where the iterator should point to at + * the beginning + * @param visitor callable that takes OMAP key and corresponding + * value as string_views and controls iteration + * by the return. It is executed for every object's + * OMAP entry from `start_from` till end of the + * object's OMAP or till the iteration is stopped + * by `STOP`. Please note that if there is no such + * entry, `visitor` will be called 0 times. 
+ * @return error code, zero on success + */ + virtual int omap_iterate( + CollectionHandle &c, + const ghobject_t &oid, + omap_iter_seek_t start_from, + std::function<omap_iter_ret_t(std::string_view, + std::string_view)> visitor + ) = 0; + virtual int flush_journal() { return -EOPNOTSUPP; } virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; } diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index e123a0a200a..50f293d45fd 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1706,7 +1706,8 @@ int BlueFS::_replay(bool noop, bool to_stdout) << " fnode=" << fnode << " delta=" << delta << dendl; - ceph_assert(delta.offset == fnode.allocated); + // be leanient, if there is no extents just produce error message + ceph_assert(delta.offset == fnode.allocated || delta.extents.empty()); } if (cct->_conf->bluefs_log_replay_check_allocations) { int r = _check_allocations(fnode, @@ -3793,7 +3794,7 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ if (offset > fnode.size) { ceph_abort_msg("truncate up not supported"); } - ceph_assert(offset <= fnode.size); + _flush_bdev(h); { std::lock_guard ll(log.lock); @@ -3802,43 +3803,42 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ vselector->sub_usage(h->file->vselector_hint, fnode); uint64_t x_off = 0; auto p = fnode.seek(offset, &x_off); - uint64_t cut_off = - (p == fnode.extents.end()) ? 
0 : p2roundup(x_off, alloc_size[p->bdev]); - uint64_t new_allocated; - if (0 == cut_off) { - // whole pextent to remove - changed_extents = true; - new_allocated = offset; - } else if (cut_off < p->length) { - dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off); - new_allocated = (offset - x_off) + cut_off; - p->length = cut_off; - changed_extents = true; - ++p; - } else { - ceph_assert(cut_off >= p->length); - new_allocated = (offset - x_off) + p->length; - // just leave it here - ++p; - } - while (p != fnode.extents.end()) { - dirty.pending_release[p->bdev].insert(p->offset, p->length); - p = fnode.extents.erase(p); - changed_extents = true; + if (p != fnode.extents.end()) { + uint64_t cut_off = p2roundup(x_off, alloc_size[p->bdev]); + if (0 == cut_off) { + // whole pextent to remove + fnode.allocated = offset; + changed_extents = true; + } else if (cut_off < p->length) { + dirty.pending_release[p->bdev].insert(p->offset + cut_off, + p->length - cut_off); + fnode.allocated = (offset - x_off) + cut_off; + p->length = cut_off; + changed_extents = true; + ++p; + } else { + // cut_off > p->length means that we misaligned the extent + ceph_assert(cut_off == p->length); + fnode.allocated = (offset - x_off) + p->length; + ++p; // leave extent untouched + } + while (p != fnode.extents.end()) { + dirty.pending_release[p->bdev].insert(p->offset, p->length); + p = fnode.extents.erase(p); + changed_extents = true; + } } if (changed_extents) { fnode.size = offset; - fnode.allocated = new_allocated; fnode.reset_delta(); + fnode.recalc_allocated(); log.t.op_file_update(fnode); // sad, but is_dirty must be set to signal flushing of the log h->file->is_dirty = true; - } else { - if (offset != fnode.size) { - fnode.size = offset; - //skipping log.t.op_file_update_inc, it will be done by flush() - h->file->is_dirty = true; - } + } else if (offset != fnode.size) { + fnode.size = offset; + // skipping log.t.op_file_update_inc, it will be done by flush() + 
h->file->is_dirty = true; } vselector->add_usage(h->file->vselector_hint, fnode); } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index a024a0c2105..8f1d995fa8d 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4830,7 +4830,7 @@ void BlueStore::Onode::rewrite_omap_key(const string& old, string *out) out->append(old.c_str() + out->length(), old.size() - out->length()); } -void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) +size_t BlueStore::Onode::calc_userkey_offset_in_omap_key() const { size_t pos = sizeof(uint64_t) + 1; if (!onode.is_pgmeta_omap()) { @@ -4840,9 +4840,15 @@ void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) pos += sizeof(uint64_t); } } - *user_key = key.substr(pos); + return pos; } +void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) +{ + *user_key = key.substr(calc_userkey_offset_in_omap_key()); +} + + void BlueStore::Onode::finish_write(TransContext* txc, uint32_t offset, uint32_t length) { while (true) { @@ -5519,7 +5525,13 @@ BlueStore::OmapIteratorImpl::OmapIteratorImpl( if (o->onode.has_omap()) { o->get_omap_key(string(), &head); o->get_omap_tail(&tail); + auto start1 = mono_clock::now(); it->lower_bound(head); + c->store->log_latency( + __func__, + l_bluestore_omap_seek_to_first_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); } } BlueStore::OmapIteratorImpl::~OmapIteratorImpl() @@ -5654,6 +5666,13 @@ bufferlist BlueStore::OmapIteratorImpl::value() return it->value(); } +std::string_view BlueStore::OmapIteratorImpl::value_as_sv() +{ + std::shared_lock l(c->lock); + ceph_assert(it->valid()); + return it->value_as_sv(); +} + // ===================================== @@ -6911,8 +6930,19 @@ int BlueStore::_check_main_bdev_label() return -EIO; } if (bluestore_bdev_label_require_all && r != 0) { - derr << __func__ << " not all labels read properly" << dendl; - return 
-EIO; + // We are about to complain that some labels failed. + // But in case if we expanded block device some labels will not be good. + uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); + uint32_t valid_locations = 0; + for (uint64_t loc : bdev_label_positions) { + if (loc + lsize <= bdev_label.size) { + ++valid_locations; + } + } + if (valid_locations != bdev_label_valid_locations.size()) { + derr << __func__ << " not all labels read properly" << dendl; + return -EIO; + } } return 0; } @@ -8948,11 +8978,25 @@ int BlueStore::expand_devices(ostream& out) _close_db_and_around(); // mount in read/write to sync expansion changes + if (bdev_label_multi) { + // We need not do fsck, because we can be broken - size is increased, + // but we might not have labels set. + cct->_conf.set_val_or_die("bluestore_fsck_on_mount", "false"); + } r = _mount(); ceph_assert(r == 0); if (fm && fm->is_null_manager()) { // we grow the allocation range, must reflect it in the allocation file alloc->init_add_free(size0, size - size0); + if (bdev_label_multi) { + uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); + for (uint64_t loc : bdev_label_positions) { + if ((loc >= size0) && (loc + lsize <= size)) { + bdev_label_valid_locations.push_back(loc); + } + } + _write_bdev_label(cct, bdev, path + "/block", bdev_label, bdev_label_valid_locations); + } need_to_destage_allocation_file = true; } umount(); @@ -13601,52 +13645,6 @@ int BlueStore::omap_get_values( return r; } -#ifdef WITH_SEASTAR -int BlueStore::omap_get_values( - CollectionHandle &c_, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<string> &start_after, ///< [in] Keys to get - map<string, bufferlist> *output ///< [out] Returned keys and values - ) -{ - Collection *c = static_cast<Collection *>(c_.get()); - dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; - if (!c->exists) - return -ENOENT; - std::shared_lock 
l(c->lock); - int r = 0; - OnodeRef o = c->get_onode(oid, false); - if (!o || !o->exists) { - r = -ENOENT; - goto out; - } - if (!o->onode.has_omap()) { - goto out; - } - o->flush(); - { - ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid); - if (!iter) { - r = -ENOENT; - goto out; - } - if (start_after) { - iter->upper_bound(*start_after); - } else { - iter->seek_to_first(); - } - for (; iter->valid(); iter->next()) { - output->insert(make_pair(iter->key(), iter->value())); - } - } - -out: - dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r - << dendl; - return r; -} -#endif - int BlueStore::omap_check_keys( CollectionHandle &c_, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap @@ -13724,6 +13722,94 @@ ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator( return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(logger,c, o, it)); } +int BlueStore::omap_iterate( + CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; + if (!c->exists) { + return -ENOENT; + } + std::shared_lock l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl; + return -ENOENT; + } + o->flush(); + dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl; + if (!o->onode.has_omap()) { + // nothing to do + return 0; + } + + KeyValueDB::Iterator it; + { + auto bounds = KeyValueDB::IteratorBounds(); + std::string lower_bound, upper_bound; + o->get_omap_key(string(), &lower_bound); + o->get_omap_tail(&upper_bound); + bounds.lower_bound = std::move(lower_bound); + 
bounds.upper_bound = std::move(upper_bound); + it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds)); + } + + // seek the iterator + { + std::string key; + o->get_omap_key(start_from.seek_position, &key); + auto start = ceph::mono_clock::now(); + if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) { + it->lower_bound(key); + c->store->log_latency( + __func__, + l_bluestore_omap_lower_bound_lat, + ceph::mono_clock::now() - start, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + } else { + it->upper_bound(key); + c->store->log_latency( + __func__, + l_bluestore_omap_upper_bound_lat, + ceph::mono_clock::now() - start, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + } + } + + // iterate! + std::string tail; + o->get_omap_tail(&tail); + const std::string_view::size_type userkey_offset_in_dbkey = + o->calc_userkey_offset_in_omap_key(); + ceph::timespan next_lat_acc{0}; + while (it->valid()) { + const auto& db_key = it->raw_key_as_sv().second; + if (db_key >= tail) { + break; + } + std::string_view user_key = db_key.substr(userkey_offset_in_dbkey); + omap_iter_ret_t ret = f(user_key, it->value_as_sv()); + if (ret == omap_iter_ret_t::STOP) { + break; + } else if (ret == omap_iter_ret_t::NEXT) { + ceph::time_guard<ceph::mono_clock>{next_lat_acc}; + it->next(); + } else { + ceph_abort(); + } + } + c->store->log_latency( + __func__, + l_bluestore_omap_next_lat, + next_lat_acc, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + return 0; +} + // ----------------- // write helpers diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 99f8d057cf0..5549f97ffea 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1457,6 +1457,7 @@ public: } void rewrite_omap_key(const std::string& old, std::string *out); + size_t calc_userkey_offset_in_omap_key() const; void decode_omap_key(const std::string& key, std::string *user_key); void finish_write(TransContext* txc, uint32_t offset, 
uint32_t length); @@ -1753,6 +1754,7 @@ public: int next() override; std::string key() override; ceph::buffer::list value() override; + std::string_view value_as_sv() override; std::string tail_key() override { return tail; } @@ -3416,15 +3418,6 @@ public: std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) override; -#ifdef WITH_SEASTAR - int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) override; -#endif - /// Filters keys into out which are defined on oid int omap_check_keys( CollectionHandle &c, ///< [in] Collection containing oid @@ -3438,6 +3431,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override { fsid = u; } diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc index e18dd490140..fe77f7f74d8 100644 --- a/src/os/bluestore/bluefs_types.cc +++ b/src/os/bluestore/bluefs_types.cc @@ -154,7 +154,9 @@ mempool::bluefs::vector<bluefs_extent_t>::iterator bluefs_fnode_t::seek( assert(it != extents_index.begin()); --it; assert(offset >= *it); - p += it - extents_index.begin(); + uint32_t skip = it - extents_index.begin(); + ceph_assert(skip <= extents.size()); + p += skip; offset -= *it; } diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h index 627118c12f8..08b3ca0cf41 100644 --- a/src/os/bluestore/bluefs_types.h +++ b/src/os/bluestore/bluefs_types.h @@ -89,6 +89,7 @@ struct bluefs_fnode_t { void 
recalc_allocated() { allocated = 0; extents_index.reserve(extents.size()); + extents_index.clear(); for (auto& p : extents) { extents_index.emplace_back(allocated); allocated += p.length; diff --git a/src/os/kstore/KStore.cc b/src/os/kstore/KStore.cc index 7158486ca38..a069d429155 100644 --- a/src/os/kstore/KStore.cc +++ b/src/os/kstore/KStore.cc @@ -1651,6 +1651,13 @@ bufferlist KStore::OmapIteratorImpl::value() return it->value(); } +std::string_view KStore::OmapIteratorImpl::value_as_sv() +{ + std::shared_lock l{c->lock}; + ceph_assert(it->valid()); + return it->value_as_sv(); +} + int KStore::omap_get( CollectionHandle& ch, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap @@ -1866,6 +1873,71 @@ ObjectMap::ObjectMapIterator KStore::get_omap_iterator( return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it)); } +int KStore::omap_iterate( + CollectionHandle &ch, ///< [in] collection + const ghobject_t &oid, ///< [in] object + ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + { + std::shared_lock l{c->lock}; + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl; + return -ENOENT; + } + o->flush(); + dout(10) << __func__ << " header = " << o->onode.omap_head <<dendl; + + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + std::string tail; + std::string seek_key; + if (o->onode.omap_head) { + return 0; // nothing to do + } + + // acquire data depedencies for seek & iterate + get_omap_key(o->onode.omap_head, start_from.seek_position, &seek_key); + get_omap_tail(o->onode.omap_head, &tail); + + // acquire the iterator + { + it = db->get_iterator(PREFIX_OMAP); + } + + // seek the iterator 
+ { + if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) { + it->lower_bound(seek_key); + } else { + it->upper_bound(seek_key); + } + } + + // iterate! + while (it->valid()) { + std::string user_key; + if (const auto& db_key = it->raw_key().second; db_key >= tail) { + break; + } else { + decode_omap_key(db_key, &user_key); + } + omap_iter_ret_t ret = f(user_key, it->value_as_sv()); + if (ret == omap_iter_ret_t::STOP) { + break; + } else if (ret == omap_iter_ret_t::NEXT) { + it->next(); + } else { + ceph_abort(); + } + } + } + return 0; +} + // ----------------- // write helpers diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h index 9a9d413c66a..06115d3cab7 100644 --- a/src/os/kstore/KStore.h +++ b/src/os/kstore/KStore.h @@ -180,6 +180,7 @@ public: int next() override; std::string key() override; ceph::buffer::list value() override; + std::string_view value_as_sv() override; int status() override { return 0; } @@ -553,6 +554,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override { fsid = u; } diff --git a/src/os/memstore/MemStore.cc b/src/os/memstore/MemStore.cc index 89cb09361cf..f9d3bf0d8a2 100644 --- a/src/os/memstore/MemStore.cc +++ b/src/os/memstore/MemStore.cc @@ -537,30 +537,6 @@ int MemStore::omap_get_values( return 0; } -#ifdef WITH_SEASTAR -int MemStore::omap_get_values( - CollectionHandle& ch, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) -{ - dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; - 
Collection *c = static_cast<Collection*>(ch.get()); - ObjectRef o = c->get_object(oid); - if (!o) - return -ENOENT; - assert(start_after); - std::lock_guard lock{o->omap_mutex}; - for (auto it = o->omap.upper_bound(*start_after); - it != std::end(o->omap); - ++it) { - out->insert(*it); - } - return 0; -} -#endif - int MemStore::omap_check_keys( CollectionHandle& ch, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap @@ -622,6 +598,10 @@ public: std::lock_guard lock{o->omap_mutex}; return it->second; } + std::string_view value_as_sv() override { + std::lock_guard lock{o->omap_mutex}; + return std::string_view{it->second.c_str(), it->second.length()}; + } int status() override { return 0; } @@ -639,6 +619,48 @@ ObjectMap::ObjectMapIterator MemStore::get_omap_iterator( return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o)); } +int MemStore::omap_iterate( + CollectionHandle &ch, ///< [in] collection + const ghobject_t &oid, ///< [in] object + ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f) +{ + Collection *c = static_cast<Collection*>(ch.get()); + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + + { + std::lock_guard lock{o->omap_mutex}; + + // obtain seek the iterator + decltype(o->omap)::iterator it; + { + if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) { + it = o->omap.lower_bound(start_from.seek_position); + } else { + it = o->omap.upper_bound(start_from.seek_position); + } + } + + // iterate! + while (it != o->omap.end()) { + // potentially rectifying memcpy but who cares for memstore? 
+ omap_iter_ret_t ret = + f(it->first, std::string_view{it->second.c_str(), it->second.length()}); + if (ret == omap_iter_ret_t::STOP) { + break; + } else if (ret == omap_iter_ret_t::NEXT) { + ++it; + } else { + ceph_abort(); + } + } + } + return 0; +} + // --------------- // write operations diff --git a/src/os/memstore/MemStore.h b/src/os/memstore/MemStore.h index 2abe552891f..9621773598f 100644 --- a/src/os/memstore/MemStore.h +++ b/src/os/memstore/MemStore.h @@ -363,14 +363,6 @@ public: const std::set<std::string> &keys, ///< [in] Keys to get std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) override; -#ifdef WITH_SEASTAR - int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) override; -#endif using ObjectStore::omap_check_keys; /// Filters keys into out which are defined on oid @@ -387,6 +379,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override; uuid_d get_fsid() override; diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index fa2570aba42..8630b038812 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -945,6 +945,10 @@ void ECBackend::handle_sub_write( } trace.event("handle_sub_write"); + if (cct->_conf->bluestore_debug_inject_read_err && + ec_inject_test_write_error3(op.soid)) { + ceph_abort_msg("Error inject - OSD down"); + } if (!get_parent()->pgb_is_primary()) get_parent()->update_stats(op.stats); ObjectStore::Transaction localt; @@ 
-1191,6 +1195,15 @@ void ECBackend::handle_sub_write_reply( i->second->on_all_commit = 0; i->second->trace.event("ec write all committed"); } + if (cct->_conf->bluestore_debug_inject_read_err && + (i->second->pending_commit.size() == 1) && + ec_inject_test_write_error2(i->second->hoid)) { + std::string cmd = + "{ \"prefix\": \"osd down\", \"ids\": [\"" + std::to_string( get_parent()->whoami() ) + "\"] }"; + vector<std::string> vcmd{cmd}; + dout(0) << __func__ << " Error inject - marking OSD down" << dendl; + get_parent()->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr); + } rmw_pipeline.check_ops(); } @@ -1208,6 +1221,19 @@ void ECBackend::handle_sub_read_reply( return; } ReadOp &rop = iter->second; + if (cct->_conf->bluestore_debug_inject_read_err) { + for (auto i = op.buffers_read.begin(); + i != op.buffers_read.end(); + ++i) { + if (ec_inject_test_read_error0(ghobject_t(i->first, ghobject_t::NO_GEN, op.from.shard))) { + dout(0) << __func__ << " Error inject - EIO error for shard " << op.from.shard << dendl; + op.buffers_read.erase(i->first); + op.attrs_read.erase(i->first); + op.errors[i->first] = -EIO; + } + + } + } for (auto i = op.buffers_read.begin(); i != op.buffers_read.end(); ++i) { diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc index 609ac3141ae..59077547fcb 100644 --- a/src/osd/ECCommon.cc +++ b/src/osd/ECCommon.cc @@ -226,8 +226,14 @@ void ECCommon::ReadPipeline::get_all_avail_shards( ++i) { dout(10) << __func__ << ": checking acting " << *i << dendl; const pg_missing_t &missing = get_parent()->get_shard_missing(*i); - if (error_shards.find(*i) != error_shards.end()) + if (error_shards.contains(*i)) { continue; + } + if (cct->_conf->bluestore_debug_inject_read_err && + ec_inject_test_read_error1(ghobject_t(hoid, ghobject_t::NO_GEN, i->shard))) { + dout(0) << __func__ << " Error inject - Missing shard " << i->shard << dendl; + continue; + } if (!missing.is_missing(hoid)) { ceph_assert(!have.count(i->shard)); have.insert(i->shard); @@ 
-912,6 +918,11 @@ bool ECCommon::RMWPipeline::try_reads_to_commit() if (*i == get_parent()->whoami_shard()) { should_write_local = true; local_write_op.claim(sop); + } else if (cct->_conf->bluestore_debug_inject_read_err && + ec_inject_test_write_error1(ghobject_t(op->hoid, + ghobject_t::NO_GEN, i->shard))) { + dout(0) << " Error inject - Dropping write message to shard " << + i->shard << dendl; } else { MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop); r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard); @@ -1090,3 +1101,305 @@ ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info( } return ref; } + +// Error inject interfaces +static ceph::recursive_mutex ec_inject_lock = + ceph::make_recursive_mutex("ECCommon::ec_inject_lock"); +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_read_failures0; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_read_failures1; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures0; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures1; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures2; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures3; +static std::map<ghobject_t,shard_id_t> ec_inject_write_failures0_shard; +static std::set<osd_reqid_t> ec_inject_write_failures0_reqid; + +/** + * Configure a read error inject that typically forces additional reads of + * shards in an EC pool to recover data using the redundancy. With multiple + * errors it is possible to force client reads to fail. + * + * Type 0 - Simulate a medium error. Fail a read with -EIO to force + * additional reads and a decode + * + * Type 1 - Simulate a missing OSD. Dont even try to read a shard + * + * @brief Set up a read error inject for an object in an EC pool. + * @param o Target object for the error inject. + * @param when Error inject starts after this many object store reads. 
+ * @param duration Error inject affects this many object store reads. + * @param type Type of error inject 0 = EIO, 1 = missing shard. + * @return string Result of configuring the error inject. + */ +std::string ec_inject_read_error(const ghobject_t& o, + const int64_t type, + const int64_t when, + const int64_t duration) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + ghobject_t os = o; + if (os.hobj.oid.name == "*") { + os.hobj.set_hash(0); + } + switch (type) { + case 0: + ec_inject_read_failures0[os] = std::pair(when, duration); + return "ok - read returns EIO"; + case 1: + ec_inject_read_failures1[os] = std::pair(when, duration); + return "ok - read pretends shard is missing"; + default: + break; + } + return "unrecognized error inject type"; +} + +/** + * Configure a write error inject that either fails an OSD or causes a + * client write operation to be rolled back. + * + * Type 0 - Tests rollback. Drop a write I/O to a shard, then simulate an OSD + * down to force rollback to occur, lastly fail the retried write from the + * client so the results of the rollback can be inspected. + * + * Type 1 - Drop a write I/O to a shard. Used on its own this will hang a + * write I/O. + * + * Type 2 - Simulate an OSD down (ceph osd down) to force a new epoch. Usually + * used together with type 1 to force a rollback + * + * Type 3 - Abort when an OSD processes a write I/O to a shard. Typically the + * client write will be commited while the OSD is absent which will result in + * recovery or backfill later when the OSD returns. + * + * @brief Set up a write error inject for an object in an EC pool. + * @param o Target object for the error inject. + * @param when Error inject starts after this many object store reads. + * @param duration Error inject affects this many object store reads. + * @param type Type of error inject 0 = EIO, 1 = missing shard. + * @return string Result of configuring the error inect. 
+ */ +std::string ec_inject_write_error(const ghobject_t& o, + const int64_t type, + const int64_t when, + const int64_t duration) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures; + ghobject_t os = o; + bool no_shard = true; + std::string result; + switch (type) { + case 0: + failures = &ec_inject_write_failures0; + result = "ok - drop write, sim OSD down and fail client retry with EINVAL"; + break; + case 1: + failures = &ec_inject_write_failures1; + no_shard = false; + result = "ok - drop write to shard"; + break; + case 2: + failures = &ec_inject_write_failures2; + result = "ok - inject OSD down"; + break; + case 3: + if (duration != 1) { + return "duration must be 1"; + } + failures = &ec_inject_write_failures3; + result = "ok - write abort OSDs"; + break; + default: + return "unrecognized error inject type"; + } + if (no_shard) { + os.set_shard(shard_id_t::NO_SHARD); + } + if (os.hobj.oid.name == "*") { + os.hobj.set_hash(0); + } + (*failures)[os] = std::pair(when, duration); + if (type == 0) { + ec_inject_write_failures0_shard[os] = o.shard_id; + } + return result; +} + +/** + * @brief Clear a previously configured read error inject. + * @param o Target object for the error inject. + * @param type Type of error inject 0 = EIO, 1 = missing shard. + * @return string Indication of how many errors were cleared. 
+ */ +std::string ec_inject_clear_read_error(const ghobject_t& o, + const int64_t type) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures; + ghobject_t os = o; + int64_t remaining = 0; + switch (type) { + case 0: + failures = &ec_inject_read_failures0; + break; + case 1: + failures = &ec_inject_read_failures1; + break; + default: + return "unrecognized error inject type"; + } + if (os.hobj.oid.name == "*") { + os.hobj.set_hash(0); + } + auto it = failures->find(os); + if (it != failures->end()) { + remaining = it->second.second; + failures->erase(it); + } + if (remaining == 0) { + return "no outstanding error injects"; + } else if (remaining == 1) { + return "ok - 1 inject cleared"; + } + return "ok - " + std::to_string(remaining) + " injects cleared"; +} + +/** + * @brief Clear a previously configured write error inject. + * @param o Target object for the error inject. + * @param type Type of error inject 0 = EIO, 1 = missing shard. + * @return string Indication of how many errors were cleared. 
+ */ +std::string ec_inject_clear_write_error(const ghobject_t& o, + const int64_t type) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures; + ghobject_t os = o; + bool no_shard = true; + int64_t remaining = 0; + switch (type) { + case 0: + failures = &ec_inject_write_failures0; + break; + case 1: + failures = &ec_inject_write_failures1; + no_shard = false; + break; + case 2: + failures = &ec_inject_write_failures2; + break; + case 3: + failures = &ec_inject_write_failures3; + break; + default: + return "unrecognized error inject type"; + } + if (no_shard) { + os.set_shard(shard_id_t::NO_SHARD); + } + if (os.hobj.oid.name == "*") { + os.hobj.set_hash(0); + } + auto it = failures->find(os); + if (it != failures->end()) { + remaining = it->second.second; + failures->erase(it); + if (type == 0) { + ec_inject_write_failures0_shard.erase(os); + } + } + if (remaining == 0) { + return "no outstanding error injects"; + } else if (remaining == 1) { + return "ok - 1 inject cleared"; + } + return "ok - " + std::to_string(remaining) + " injects cleared"; +} + +static bool ec_inject_test_error(const ghobject_t& o, + std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures) +{ + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + auto it = failures->find(o); + if (it == failures->end()) { + ghobject_t os = o; + os.hobj.oid.name = "*"; + os.hobj.set_hash(0); + it = failures->find(os); + } + if (it != failures->end()) { + auto && [when,duration] = it->second; + if (when > 0) { + when--; + return false; + } + if (--duration <= 0) { + failures->erase(it); + } + return true; + } + return false; +} + +bool ec_inject_test_read_error0(const ghobject_t& o) +{ + return ec_inject_test_error(o, &ec_inject_read_failures0); +} + +bool ec_inject_test_read_error1(const ghobject_t& o) +{ + return ec_inject_test_error(o, &ec_inject_read_failures1); +} + +bool ec_inject_test_write_error0(const hobject_t& o, + const 
osd_reqid_t& reqid) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + ghobject_t os = ghobject_t(o, ghobject_t::NO_GEN, shard_id_t::NO_SHARD); + if (ec_inject_write_failures0_reqid.count(reqid)) { + // Matched reqid of retried write - flag for failure + ec_inject_write_failures0_reqid.erase(reqid); + return true; + } + auto it = ec_inject_write_failures0.find(os); + if (it == ec_inject_write_failures0.end()) { + os.hobj.oid.name = "*"; + os.hobj.set_hash(0); + it = ec_inject_write_failures0.find(os); + } + if (it != ec_inject_write_failures0.end()) { + auto && [when, duration] = it->second; + auto shard = ec_inject_write_failures0_shard.find(os)->second; + if (when > 0) { + when--; + } else { + if (--duration <= 0) { + ec_inject_write_failures0.erase(it); + ec_inject_write_failures0_shard.erase(os); + } + // Error inject triggered - save reqid + ec_inject_write_failures0_reqid.insert(reqid); + // Set up error inject to drop message to primary + ec_inject_write_error(ghobject_t(o, ghobject_t::NO_GEN, shard), 1, 0, 1); + } + } + return false; +} + +bool ec_inject_test_write_error1(const ghobject_t& o) { + bool rc = ec_inject_test_error(o, &ec_inject_write_failures1); + if (rc) { + // Set up error inject to generate OSD down + ec_inject_write_error(o, 2, 0, 1); + } + return rc; +} + +bool ec_inject_test_write_error2(const hobject_t& o) { + return ec_inject_test_error( + ghobject_t(o, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + &ec_inject_write_failures2); +} + +bool ec_inject_test_write_error3(const hobject_t& o) { + return ec_inject_test_error( + ghobject_t(o, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + &ec_inject_write_failures3); +} diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h index 7ff9cae7646..de4c11ad50f 100644 --- a/src/osd/ECCommon.h +++ b/src/osd/ECCommon.h @@ -493,6 +493,7 @@ struct ECCommon { ); ///< @return error code, 0 on success void schedule_recovery_work(); + }; /** @@ -843,3 +844,15 @@ void 
ECCommon::ReadPipeline::filter_read_op( on_schedule_recovery(op); } } + +// Error inject interfaces +std::string ec_inject_read_error(const ghobject_t& o, const int64_t type, const int64_t when, const int64_t duration); +std::string ec_inject_write_error(const ghobject_t& o, const int64_t type, const int64_t when, const int64_t duration); +std::string ec_inject_clear_read_error(const ghobject_t& o, const int64_t type); +std::string ec_inject_clear_write_error(const ghobject_t& o, const int64_t type); +bool ec_inject_test_read_error0(const ghobject_t& o); +bool ec_inject_test_read_error1(const ghobject_t& o); +bool ec_inject_test_write_error0(const hobject_t& o,const osd_reqid_t& reqid); +bool ec_inject_test_write_error1(const ghobject_t& o); +bool ec_inject_test_write_error2(const hobject_t& o); +bool ec_inject_test_write_error3(const hobject_t& o); diff --git a/src/osd/ExtentCache.h b/src/osd/ExtentCache.h index 972228cd077..7dc1d4f7263 100644 --- a/src/osd/ExtentCache.h +++ b/src/osd/ExtentCache.h @@ -363,7 +363,7 @@ private: extent, boost::intrusive::list_member_hook<>, &extent::pin_list_member>; - using list = boost::intrusive::list<extent, list_member_options>; + using list = boost::intrusive::list<extent, boost::intrusive::constant_time_size<false>, list_member_options>; list pin_list; ~pin_state() { ceph_assert(pin_list.empty()); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 97fefc5e54a..9c9e540cf61 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -37,6 +37,7 @@ #include "osd/PG.h" #include "osd/scrubber/scrub_machine.h" #include "osd/scrubber/pg_scrubber.h" +#include "osd/ECCommon.h" #include "include/types.h" #include "include/compat.h" @@ -4348,6 +4349,46 @@ void OSD::final_init() "inject metadata error to an object"); ceph_assert(r == 0); r = admin_socket->register_command( + "injectecreaderr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=true,range=0|255 " \ + 
"name=type,type=CephInt,req=false " \ + "name=when,type=CephInt,req=false " \ + "name=duration,type=CephInt,req=false", + test_ops_hook, + "inject error for read of object in an EC pool"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "injectecclearreaderr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=true,range=0|255 " \ + "name=type,type=CephInt,req=false", + test_ops_hook, + "clear read error injects for object in an EC pool"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "injectecwriteerr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=true,range=0|255 " \ + "name=type,type=CephInt,req=false " \ + "name=when,type=CephInt,req=false " \ + "name=duration,type=CephInt,req=false", + test_ops_hook, + "inject error for write of object in an EC pool"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "injectecclearwriteerr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=true,range=0|255 " \ + "name=type,type=CephInt,req=false", + test_ops_hook, + "clear write error inject for object in an EC pool"); + ceph_assert(r == 0); + r = admin_socket->register_command( "set_recovery_delay " \ "name=utime,type=CephInt,req=false", test_ops_hook, @@ -6487,8 +6528,10 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, //directly request the osd make a change. 
if (command == "setomapval" || command == "rmomapkey" || command == "setomapheader" || command == "getomap" || - command == "truncobj" || command == "injectmdataerr" || - command == "injectdataerr" + command == "truncobj" || + command == "injectmdataerr" || command == "injectdataerr" || + command == "injectecreaderr" || command == "injectecclearreaderr" || + command == "injectecwriteerr" || command == "injectecclearwriteerr" ) { pg_t rawpg; int64_t pool; @@ -6527,8 +6570,21 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid))); spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid)); if (curmap->pg_is_ec(rawpg)) { - if ((command != "injectdataerr") && (command != "injectmdataerr")) { - ss << "Must not call on ec pool, except injectdataerr or injectmdataerr"; + if ((command != "injectdataerr") && + (command != "injectmdataerr") && + (command != "injectecreaderr") && + (command != "injectecclearreaderr") && + (command != "injectecwriteerr") && + (command != "injectecclearwriteerr")) { /* only the inject* family may target an EC pool */ + ss << "Must not call on ec pool"; + return; + } + } else { /* not an EC pool: reject all four EC-only inject/clear commands */ + if ((command == "injectecreaderr") || + (command == "injectecclearreaderr") || + (command == "injectecwriteerr") || + (command == "injectecclearwriteerr")) { /* names must match the registered commands exactly */ + ss << "Only supported on ec pool"; return; } } @@ -6607,6 +6663,38 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, } else if (command == "injectmdataerr") { store->inject_mdata_error(gobj); ss << "ok"; + } else if (command == "injectecreaderr") { + if (service->cct->_conf->bluestore_debug_inject_read_err) { + int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0); + int64_t when = cmd_getval_or<int64_t>(cmdmap, "when", 0); + int64_t duration = cmd_getval_or<int64_t>(cmdmap, "duration", 1); + ss << ec_inject_read_error(gobj, type, when, duration); + } else { + ss << "bluestore_debug_inject_read_err not enabled"; + } + } else if (command ==
"injectecclearreaderr") { + if (service->cct->_conf->bluestore_debug_inject_read_err) { + int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0); + ss << ec_inject_clear_read_error(gobj, type); + } else { + ss << "bluestore_debug_inject_read_err not enabled"; + } + } else if (command == "injectecwriteerr") { + if (service->cct->_conf->bluestore_debug_inject_read_err) { + int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0); + int64_t when = cmd_getval_or<int64_t>(cmdmap, "when", 0); + int64_t duration = cmd_getval_or<int64_t>(cmdmap, "duration", 1); + ss << ec_inject_write_error(gobj, type, when, duration); + } else { + ss << "bluestore_debug_inject_read_err not enabled"; + } + } else if (command == "injectecclearwriteerr") { + if (service->cct->_conf->bluestore_debug_inject_read_err) { + int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0); + ss << ec_inject_clear_write_error(gobj, type); + } else { + ss << "bluestore_debug_inject_read_err not enabled"; + } } return; } @@ -9958,7 +10046,8 @@ const char** OSD::get_tracked_conf_keys() const "osd_scrub_max_interval", "osd_op_thread_timeout", "osd_op_thread_suicide_timeout", - NULL + "osd_max_scrubs", + nullptr }; return KEYS; } @@ -10002,6 +10091,10 @@ void OSD::handle_conf_change(const ConfigProxy& conf, service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs); } if (changed.count("osd_max_scrubs")) { + dout(0) << fmt::format( + "{}: scrub concurrency max changed to {}", + __func__, cct->_conf->osd_max_scrubs) + << dendl; service.scrub_reserver.set_max(cct->_conf->osd_max_scrubs); } if (changed.count("osd_op_complaint_time") || diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index b87484c1a9d..9b3593d54e5 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1642,12 +1642,10 @@ void OSDMap::get_out_of_subnet_osd_counts(CephContext *cct, for (int i = 0; i < max_osd; i++) { if (exists(i) && is_up(i)) { if (const auto& addrs = get_addrs(i).v; addrs.size() >= 2) { - auto v1_addr = 
addrs[0].ip_only_to_str(); - if (!is_addr_in_subnet(cct, public_network, v1_addr)) { + if (!is_addr_in_subnet(cct, public_network, addrs[0])) { unreachable->emplace(i); } - auto v2_addr = addrs[1].ip_only_to_str(); - if (!is_addr_in_subnet(cct, public_network, v2_addr)) { + if (!is_addr_in_subnet(cct, public_network, addrs[1])) { unreachable->emplace(i); } } diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index b87aa1da677..f5eb9ea951e 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -290,6 +290,10 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef; MessageRef, Connection *con) = 0; virtual void send_message_osd_cluster( Message *m, const ConnectionRef& con) = 0; + virtual void start_mon_command( + const std::vector<std::string>& cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs, + Context *onfinish) = 0; virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0; virtual entity_name_t get_cluster_msgr_name() = 0; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 44f8e85b5ef..3324ba9dc91 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -2286,6 +2286,16 @@ void PrimaryLogPG::do_op(OpRequestRef& op) } } + if (cct->_conf->bluestore_debug_inject_read_err && + op->may_write() && + pool.info.is_erasure() && + ec_inject_test_write_error0(m->get_hobj(), m->get_reqid())) { + // Fail retried write with error + dout(0) << __func__ << " Error inject - Fail retried write with EINVAL" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + ObjectContextRef obc; bool can_create = op->may_write(); hobject_t missing_oid; @@ -5798,10 +5808,19 @@ int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op) int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl) { - for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) { - char read_byte = (idx < read_bl.length() ? 
read_bl[idx] : 0); - if (osd_op.indata[idx] != read_byte) { - return (-MAX_ERRNO - idx); + auto input_iter = osd_op.indata.begin(); + auto read_iter = read_bl.begin(); + uint64_t idx = 0; + + while (input_iter != osd_op.indata.end()) { + char read_byte = (read_iter != read_bl.end() ? *read_iter : 0); + if (*input_iter != read_byte) { + return (-MAX_ERRNO - idx); + } + ++idx; + ++input_iter; + if (read_iter != read_bl.end()) { + ++read_iter; } } @@ -7767,27 +7786,34 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) bool truncated = false; bufferlist bl; if (oi.is_omap()) { - ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( - ch, ghobject_t(soid) - ); - if (!iter) { - result = -ENOENT; - goto fail; - } - iter->upper_bound(start_after); - if (filter_prefix > start_after) iter->lower_bound(filter_prefix); - for (num = 0; - iter->valid() && - iter->key().substr(0, filter_prefix.size()) == filter_prefix; - ++num, iter->next()) { - dout(20) << "Found key " << iter->key() << dendl; - if (num >= max_return || - bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) { - truncated = true; - break; - } - encode(iter->key(), bl); - encode(iter->value(), bl); + using omap_iter_seek_t = ObjectStore::omap_iter_seek_t; + result = osd->store->omap_iterate( + ch, ghobject_t(soid), + // try to seek as many keys-at-once as possible for the sake of performance. + // note complexity should be logarithmic, so seek(n/2) + seek(n/2) is worse + // than just seek(n). + ObjectStore::omap_iter_seek_t{ + .seek_position = std::max(start_after, filter_prefix), + .seek_type = filter_prefix > start_after ? 
omap_iter_seek_t::LOWER_BOUND + : omap_iter_seek_t::UPPER_BOUND + }, + [&bl, &truncated, &filter_prefix, &num, max_return, + max_bytes=cct->_conf->osd_max_omap_bytes_per_request] + (std::string_view key, std::string_view value) mutable { + if (key.substr(0, filter_prefix.size()) != filter_prefix) { + return ObjectStore::omap_iter_ret_t::STOP; + } + if (num >= max_return || bl.length() >= max_bytes) { + truncated = true; + return ObjectStore::omap_iter_ret_t::STOP; + } + encode(key, bl); + encode(value, bl); + ++num; + return ObjectStore::omap_iter_ret_t::NEXT; + }); + if (result < 0) { + goto fail; } } // else return empty out_set encode(num, osd_op.outdata); diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index f66b5c6e16a..bf55d539821 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -622,6 +622,12 @@ public: Message *m, const ConnectionRef& con) override { osd->send_message_osd_cluster(m, con); } + void start_mon_command( + const std::vector<std::string>& cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs, + Context *onfinish) override { + osd->monc->start_mon_command(cmd, inbl, outbl, outs, onfinish); + } ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) override; entity_name_t get_cluster_msgr_name() override { return osd->get_cluster_msgr_name(); @@ -1993,6 +1999,7 @@ public: private: DynamicPerfStats m_dynamic_perf_stats; + }; inline ostream& operator<<(ostream& out, const PrimaryLogPG::RepGather& repop) @@ -2021,5 +2028,4 @@ inline ostream& operator<<(ostream& out, void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop); void intrusive_ptr_release(PrimaryLogPG::RepGather *repop); - #endif diff --git a/src/osd/Session.h b/src/osd/Session.h index 9fa9c655456..05a0119d31e 100644 --- a/src/osd/Session.h +++ b/src/osd/Session.h @@ -136,7 +136,7 @@ struct Session : public RefCountedObject { ceph::mutex session_dispatch_lock = ceph::make_mutex("Session::session_dispatch_lock"); - 
boost::intrusive::list<OpRequest> waiting_on_map; + boost::intrusive::list<OpRequest, boost::intrusive::constant_time_size<false>> waiting_on_map; ceph::spinlock projected_epoch_lock; epoch_t projected_epoch = 0; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 5c2cf8b16b0..048f5aa0009 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2942,6 +2942,14 @@ std::string pg_stat_t::dump_scrub_schedule() const return fmt::format( "Blocked! locked objects (for {}s)", scrub_sched_status.m_duration_seconds); + } else if (scrub_sched_status.m_num_to_reserve != 0) { + // we are waiting for some replicas to respond + return fmt::format( + "Reserving. Waiting {}s for OSD.{} ({}/{})", + scrub_sched_status.m_duration_seconds, + scrub_sched_status.m_osd_to_respond, + scrub_sched_status.m_ordinal_of_requested_replica, + scrub_sched_status.m_num_to_reserve); } else { return fmt::format( "{}scrubbing for {}s", @@ -2964,7 +2972,7 @@ std::string pg_stat_t::dump_scrub_schedule() const case pg_scrub_sched_status_t::queued: return fmt::format( "queued for {}scrub", - ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : "")); + (scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""); default: // a bug! 
return "SCRUB STATE MISMATCH!"s; @@ -2979,12 +2987,15 @@ bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r) l.m_duration_seconds == r.m_duration_seconds && l.m_is_active == r.m_is_active && l.m_is_deep == r.m_is_deep && - l.m_is_periodic == r.m_is_periodic; + l.m_is_periodic == r.m_is_periodic && + l.m_osd_to_respond == r.m_osd_to_respond && + l.m_ordinal_of_requested_replica == r.m_ordinal_of_requested_replica && + l.m_num_to_reserve == r.m_num_to_reserve; } void pg_stat_t::encode(ceph::buffer::list &bl) const { - ENCODE_START(29, 22, bl); + ENCODE_START(30, 22, bl); encode(version, bl); encode(reported_seq, bl); encode(reported_epoch, bl); @@ -3044,6 +3055,9 @@ void pg_stat_t::encode(ceph::buffer::list &bl) const encode(objects_trimmed, bl); encode(snaptrim_duration, bl); encode(log_dups_size, bl); + encode(scrub_sched_status.m_osd_to_respond, bl); + encode(scrub_sched_status.m_ordinal_of_requested_replica, bl); + encode(scrub_sched_status.m_num_to_reserve, bl); ENCODE_FINISH(bl); } @@ -3052,7 +3066,7 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl) { bool tmp; uint32_t old_state; - DECODE_START(29, bl); + DECODE_START(30, bl); decode(version, bl); decode(reported_seq, bl); decode(reported_epoch, bl); @@ -3142,6 +3156,18 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl) if (struct_v >= 29) { decode(log_dups_size, bl); } + if (struct_v >= 30) { + uint16_t osd_to_respond; + decode(osd_to_respond, bl); + scrub_sched_status.m_osd_to_respond = osd_to_respond; + uint8_t tmp8; + decode(tmp8, bl); + scrub_sched_status.m_ordinal_of_requested_replica = tmp8; + decode(tmp8, bl); + scrub_sched_status.m_num_to_reserve = tmp8; + } else { + scrub_sched_status.m_num_to_reserve = 0; + } } DECODE_FINISH(bl); } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index b6f5335a0f5..485fddead7a 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1151,9 +1151,8 @@ public: bool is_set(key_t key) const; 
template<typename T> - void set(key_t key, const T &val) { - value_t value = val; - opts[key] = value; + void set(key_t key, T &&val) { + opts.insert_or_assign(key, std::forward<T>(val)); } template<typename T> @@ -2223,6 +2222,13 @@ struct pg_scrubbing_status_t { bool m_is_active{false}; scrub_level_t m_is_deep{scrub_level_t::shallow}; bool m_is_periodic{true}; + // the following are only relevant when we are reserving replicas: + uint16_t m_osd_to_respond{0}; + /// this is the n'th replica we are reserving (out of m_num_to_reserve) + uint8_t m_ordinal_of_requested_replica{0}; + /// the number of replicas we are reserving for scrubbing. 0 means we are not + /// in the process of reserving replicas. + uint8_t m_num_to_reserve{0}; }; bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r); diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index fea8c757040..ba83f6ac600 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -588,6 +588,10 @@ scrub_level_t PgScrubber::scrub_requested( return scrub_level_t::shallow; } + // abort an ongoing scrub, if it's of the lowest priority + // and stuck in replica reservations. + m_fsm->process_event(AbortIfReserving{}); + // update the relevant SchedTarget (either shallow or deep). Set its urgency // to either operator_requested or must_repair. 
Push it into the queue auto& trgt = m_scrub_job->get_target(scrub_level); @@ -2073,6 +2077,7 @@ void PgScrubber::scrub_finish() } cleanup_on_finish(); + m_active_target.reset(); if (do_auto_scrub) { request_rescrubbing(); } @@ -2413,6 +2418,16 @@ void PgScrubber::dump_active_scrubber(ceph::Formatter* f) const } else { f->dump_string("schedule", "scrubbing"); } + const auto maybe_register = m_fsm->get_reservation_status(); + if (maybe_register && maybe_register->m_num_to_reserve != 0) { + f->dump_bool("is_reserving_replicas", true); + f->dump_int("osd_to_respond", maybe_register->m_osd_to_respond); + f->dump_int("duration_seconds", maybe_register->m_duration_seconds); + f->dump_int("requested_in_order", maybe_register->m_ordinal_of_requested_replica); + f->dump_int("num_to_reserve", maybe_register->m_num_to_reserve); + } else { + f->dump_bool("is_reserving_replicas", false); + } } pg_scrubbing_status_t PgScrubber::get_schedule() const @@ -2441,7 +2456,7 @@ pg_scrubbing_status_t PgScrubber::get_schedule() const pg_scrub_sched_status_t::blocked, true, // active (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow), - false}; + (m_active_target->urgency() == urgency_t::periodic_regular)}; } else { int32_t dur_seconds = @@ -2452,9 +2467,11 @@ pg_scrubbing_status_t PgScrubber::get_schedule() const pg_scrub_sched_status_t::active, true, // active (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow), - false /* is periodic? unknown, actually */}; + (m_active_target->urgency() == urgency_t::periodic_regular)}; } } + + // not registered to be scrubbed? if (!m_scrub_job->is_registered()) { return pg_scrubbing_status_t{ utime_t{}, @@ -2465,8 +2482,34 @@ pg_scrubbing_status_t PgScrubber::get_schedule() const false}; } - // not taking 'no-*scrub' flags into account here. + // in session, but still reserving replicas? 
+ const auto maybe_register = m_fsm->get_reservation_status(); + if (maybe_register) { + // note that if we are here, we are scrubbing (even though + // m_active is false). The 'maybe_register' attests to being in + // ReservingReplicas state, and m_active wasn't set yet. + dout(20) << fmt::format( + "{}:maybe_register: osd:{} {}s ({} of {})", __func__, + maybe_register->m_osd_to_respond, + maybe_register->m_duration_seconds, + maybe_register->m_ordinal_of_requested_replica, + maybe_register->m_num_to_reserve) + << dendl; + return pg_scrubbing_status_t{ + utime_t{}, + maybe_register->m_duration_seconds, + pg_scrub_sched_status_t::active, + true, // active + (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow), + (m_active_target->urgency() == urgency_t::periodic_regular), + maybe_register->m_osd_to_respond, + maybe_register->m_ordinal_of_requested_replica, + maybe_register->m_num_to_reserve}; + } + const auto first_ready = m_scrub_job->earliest_eligible(now_is); + // eligible for scrubbing, but not yet selected to be scrubbed? + // (not taking 'no-*scrub' flags into account here.) 
if (first_ready) { const auto& targ = first_ready->get(); return pg_scrubbing_status_t{ diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index da9466758f4..10866ce580a 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -106,6 +106,25 @@ ceph::timespan ScrubMachine::get_time_scrubbing() const return ceph::timespan{}; } +std::optional<pg_scrubbing_status_t> ScrubMachine::get_reservation_status() + const +{ + const auto resv_state = state_cast<const ReservingReplicas*>(); + if (!resv_state) { + return std::nullopt; + } + const auto session = state_cast<const Session*>(); + dout(30) << fmt::format( + "{}: we are reserving {:p}-{:p}", __func__, (void*)session, + (void*)resv_state) + << dendl; + if (!session || !session->m_reservations) { + dout(20) << fmt::format("{}: no reservations data", __func__) << dendl; + return std::nullopt; + } + return session->get_reservation_status(); +} + // ////////////// the actual actions // ----------------------- NotActive ----------------------------------------- @@ -203,6 +222,23 @@ sc::result Session::react(const IntervalChanged&) return transit<NotActive>(); } +std::optional<pg_scrubbing_status_t> Session::get_reservation_status() const +{ + if (!m_reservations) { + return std::nullopt; + } + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + const auto req = m_reservations->get_last_sent(); + pg_scrubbing_status_t s; + s.m_osd_to_respond = req ? 
req->osd : 0; + s.m_ordinal_of_requested_replica = m_reservations->active_requests_cnt(); + s.m_num_to_reserve = scrbr->get_pg()->get_actingset().size() - 1; + s.m_duration_seconds = + duration_cast<seconds>(context<ScrubMachine>().get_time_scrubbing()) + .count(); + return s; +} + // ----------------------- ReservingReplicas --------------------------------- diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index ad0d3bfba38..f7f739692bf 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -2,6 +2,7 @@ // vim: ts=8 sw=2 smarttab #pragma once +#include <optional> #include <string> #include <boost/statechart/custom_reaction.hpp> @@ -160,6 +161,11 @@ VALUE_EVENT(ReserverGranted, AsyncScrubResData); /// all replicas have granted our reserve request MEV(RemotesReserved) +/// abort the scrub session, if in ReservingReplicas state +/// (used when the operator issues a scrub request, and we no longer +/// need the reservations) +MEV(AbortIfReserving) + /// initiate a new scrubbing session (relevant if we are a Primary) MEV(StartScrub) @@ -289,9 +295,12 @@ class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> { [[nodiscard]] bool is_accepting_updates() const; [[nodiscard]] bool is_primary_idle() const; - // elapsed time for the currently active scrub.session + /// elapsed time for the currently active scrub.session ceph::timespan get_time_scrubbing() const; + /// replica reservation process status + std::optional<pg_scrubbing_status_t> get_reservation_status() const; + // ///////////////// aux declarations & functions //////////////////////// // @@ -555,6 +564,9 @@ struct Session : sc::state<Session, PrimaryActive, ReservingReplicas>, /// abort reason - if known. Determines the delay time imposed on the /// failed scrub target. 
std::optional<Scrub::delay_cause_t> m_abort_reason{std::nullopt}; + + /// when reserving replicas: fetch the reservation status + std::optional<pg_scrubbing_status_t> get_reservation_status() const; }; struct ReservingReplicas : sc::state<ReservingReplicas, Session>, NamedSimply { @@ -563,6 +575,7 @@ struct ReservingReplicas : sc::state<ReservingReplicas, Session>, NamedSimply { using reactions = mpl::list< sc::custom_reaction<ReplicaGrant>, sc::custom_reaction<ReplicaReject>, + sc::transition<AbortIfReserving, PrimaryIdle>, sc::transition<RemotesReserved, ActiveScrubbing>>; ScrubTimePoint entered_at = ScrubClock::now(); diff --git a/src/osd/scrubber/scrub_reservations.h b/src/osd/scrubber/scrub_reservations.h index 173b23d7db5..f5eca48b888 100644 --- a/src/osd/scrubber/scrub_reservations.h +++ b/src/osd/scrubber/scrub_reservations.h @@ -157,13 +157,13 @@ class ReplicaReservations { // note: 'public', as accessed via the 'standard' dout_prefix() macro std::ostream& gen_prefix(std::ostream& out, std::string fn) const; + /// The number of requests that have been sent (and not rejected) so far. + size_t active_requests_cnt() const; + private: /// send 'release' messages to all replicas we have managed to reserve void release_all(); - /// The number of requests that have been sent (and not rejected) so far. - size_t active_requests_cnt() const; - /** * Send a reservation request to the next replica. 
* - if there are no more replicas to send requests to, return true diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 087b623333b..82d43bb3dde 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1393,7 +1393,7 @@ void Objecter::handle_osd_map(MOSDMap *m) for (auto& [c, ec] : p->second) { asio::post(service.get_executor(), asio::append(std::move(c), ec)); } - waiting_for_map.erase(p++); + p = waiting_for_map.erase(p); } monc->sub_got("osdmap", osdmap->get_epoch()); diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index f1c56d75378..550604fc55b 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -2036,8 +2036,8 @@ class CertKeyStore(): var = service_name if entity in self.service_name_cert else host j = {} self.known_certs[entity][var] = cert_obj - for service_name in self.known_certs[entity].keys(): - j[var] = Cert.to_json(self.known_certs[entity][var]) + for cert_key in self.known_certs[entity]: + j[cert_key] = Cert.to_json(self.known_certs[entity][cert_key]) else: self.known_certs[entity] = cert_obj j = Cert.to_json(cert_obj) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index bf14f8d1715..6690153d435 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -2460,7 +2460,7 @@ Then run the following: @handle_orch_error def service_action(self, action: str, service_name: str) -> List[str]: - if service_name not in self.spec_store.all_specs.keys(): + if service_name not in self.spec_store.all_specs.keys() and service_name != 'osd': raise OrchestratorError(f'Invalid service name "{service_name}".' 
+ ' View currently running services using "ceph orch ls"') dds: List[DaemonDescription] = self.cache.get_daemons_by_service(service_name) @@ -3925,6 +3925,50 @@ Then run the following: return self.to_remove_osds.all_osds() @handle_orch_error + def set_osd_spec(self, service_name: str, osd_ids: List[str]) -> str: + """ + Update unit.meta file for osd with service name + """ + if service_name not in self.spec_store: + raise OrchestratorError(f"Cannot find service '{service_name}' in the inventory. " + "Please try again after applying an OSD service that matches " + "the service name to which you want to attach OSDs.") + + daemons: List[orchestrator.DaemonDescription] = self.cache.get_daemons_by_type('osd') + update_osd = defaultdict(list) + for daemon in daemons: + if daemon.daemon_id in osd_ids and daemon.hostname: + update_osd[daemon.hostname].append(daemon.daemon_id) + + if not update_osd: + raise OrchestratorError(f"Unable to find OSDs: {osd_ids}") + + failed_osds = [] + success_osds = [] + for host in update_osd: + osds = ",".join(update_osd[host]) + # run cephadm command with all host osds on specific host, + # if it fails, continue with other hosts + try: + with self.async_timeout_handler(host): + outs, errs, _code = self.wait_async( + CephadmServe(self)._run_cephadm(host, + cephadmNoImage, + 'update-osd-service', + ['--service-name', service_name, '--osd-ids', osds])) + if _code: + self.log.error(f"Failed to update service for {osds} osd. 
Cephadm error: {errs}") + failed_osds.extend(update_osd[host]) + else: + success_osds.extend(update_osd[host]) + except Exception: + self.log.exception(f"Failed to set service name for {osds}") + failed_osds.extend(update_osd[host]) + self.cache.invalidate_host_daemons(host) + self._kick_serve_loop() + return f"Updated service for osd {','.join(success_osds)}" + (f" and failed for {','.join(failed_osds)}" if failed_osds else "") + + @handle_orch_error @host_exists() def drain_host(self, hostname: str, force: bool = False, keep_conf_keyring: bool = False, zap_osd_devices: bool = False) -> str: """ diff --git a/src/pybind/mgr/cephadm/schedule.py b/src/pybind/mgr/cephadm/schedule.py index 98d2fe99897..04d3712c50a 100644 --- a/src/pybind/mgr/cephadm/schedule.py +++ b/src/pybind/mgr/cephadm/schedule.py @@ -385,6 +385,8 @@ class HostAssignment(object): def find_ip_on_host(self, hostname: str, subnets: List[str]) -> Optional[str]: for subnet in subnets: + # to normalize subnet + subnet = str(ipaddress.ip_network(subnet)) ips: List[str] = [] # following is to allow loopback interfaces for both ipv4 and ipv6. Since we # only have the subnet (and no IP) we assume default loopback IP address. 
diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 04f5af28a9b..4f83d7bb0fb 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -1157,6 +1157,14 @@ class RgwService(CephService): 'value': str(spec.rgw_bucket_counters_cache_size), }) + if getattr(spec, 'disable_multisite_sync_traffic', None) is not None: + ret, out, err = self.mgr.check_mon_command({ + 'prefix': 'config set', + 'who': daemon_name, + 'name': 'rgw_run_sync_thread', + 'value': 'false' if spec.disable_multisite_sync_traffic else 'true', + }) + daemon_spec.keyring = keyring daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 1b9cf618570..9c5b5a112f3 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -3,6 +3,7 @@ import logging import os import socket from typing import List, Any, Tuple, Dict, Optional, cast +import ipaddress from mgr_module import HandleCommandResult @@ -57,6 +58,8 @@ class GrafanaService(CephadmService): if ip_to_bind_to: daemon_spec.port_ips = {str(grafana_port): ip_to_bind_to} grafana_ip = ip_to_bind_to + if ipaddress.ip_network(grafana_ip).version == 6: + grafana_ip = f"[{grafana_ip}]" domain = self.mgr.get_fqdn(daemon_spec.host) mgmt_gw_ips = [] @@ -354,6 +357,13 @@ class AlertmanagerService(CephadmService): addr = self.mgr.get_fqdn(dd.hostname) peers.append(build_url(host=addr, port=port).lstrip('/')) + ip_to_bind_to = '' + if spec.only_bind_port_on_networks and spec.networks: + assert daemon_spec.host is not None + ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec) or '' + if ip_to_bind_to: + daemon_spec.port_ips = {str(port): ip_to_bind_to} + deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}') 
if security_enabled: alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() @@ -376,7 +386,8 @@ class AlertmanagerService(CephadmService): }, 'peers': peers, 'web_config': '/etc/alertmanager/web.yml', - 'use_url_prefix': mgmt_gw_enabled + 'use_url_prefix': mgmt_gw_enabled, + 'ip_to_bind_to': ip_to_bind_to }, sorted(deps) else: return { @@ -384,7 +395,8 @@ class AlertmanagerService(CephadmService): "alertmanager.yml": yml }, "peers": peers, - 'use_url_prefix': mgmt_gw_enabled + 'use_url_prefix': mgmt_gw_enabled, + 'ip_to_bind_to': ip_to_bind_to }, sorted(deps) def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription: diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index b3fd526815e..8acec94f382 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -47,6 +47,7 @@ class NvmeofService(CephService): # TODO: check if we can force jinja2 to generate dicts with double quotes instead of using json.dumps transport_tcp_options = json.dumps(spec.transport_tcp_options) if spec.transport_tcp_options else None + iobuf_options = json.dumps(spec.iobuf_options) if spec.iobuf_options else None name = '{}.{}'.format(utils.name_to_config_section('nvmeof'), nvmeof_gw_id) rados_id = name[len('client.'):] if name.startswith('client.') else name @@ -67,6 +68,7 @@ class NvmeofService(CephService): 'rpc_socket_dir': '/var/tmp/', 'rpc_socket_name': 'spdk.sock', 'transport_tcp_options': transport_tcp_options, + 'iobuf_options': iobuf_options, 'rados_id': rados_id } gw_conf = self.mgr.template.render('services/nvmeof/ceph-nvmeof.conf.j2', context) diff --git a/src/pybind/mgr/cephadm/ssh.py b/src/pybind/mgr/cephadm/ssh.py index 1622cb001ab..acb5a77c51b 100644 --- a/src/pybind/mgr/cephadm/ssh.py +++ b/src/pybind/mgr/cephadm/ssh.py @@ -358,7 +358,7 @@ class SSHManager: await self._check_execute_command(host, chown, addr=addr) chmod = 
RemoteCommand(Executables.CHMOD, [oct(mode)[2:], tmp_path]) await self._check_execute_command(host, chmod, addr=addr) - mv = RemoteCommand(Executables.MV, [tmp_path, path]) + mv = RemoteCommand(Executables.MV, ['-Z', tmp_path, path]) await self._check_execute_command(host, mv, addr=addr) except Exception as e: msg = f"Unable to write {host}:{path}: {e}" diff --git a/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 b/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 index de993cb6ce3..b6955caf616 100644 --- a/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 @@ -8,6 +8,8 @@ global: tls_config: {% if security_enabled %} ca_file: root_cert.pem + cert_file: alertmanager.crt + key_file: alertmanager.key {% else %} insecure_skip_verify: true {% endif %} diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 index b9773ceeeb3..14af0fd48ca 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 @@ -9,6 +9,7 @@ events { http { #access_log /dev/stdout; + error_log /dev/stderr info; client_header_buffer_size 32K; large_client_header_buffers 4 32k; proxy_busy_buffers_size 512k; diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 index 0b84ee1bfab..2a9ab309568 100644 --- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 @@ -18,6 +18,7 @@ prometheus_exporter_ssl = False prometheus_port = {{ spec.prometheus_port }} prometheus_stats_interval = {{ spec.prometheus_stats_interval }} verify_nqns = {{ spec.verify_nqns }} +verify_keys = {{ 
spec.verify_keys }} omap_file_lock_duration = {{ spec.omap_file_lock_duration }} omap_file_lock_retries = {{ spec.omap_file_lock_retries }} omap_file_lock_retry_sleep_interval = {{ spec.omap_file_lock_retry_sleep_interval }} @@ -85,6 +86,9 @@ transport_tcp_options = {{ transport_tcp_options }} {% if spec.tgt_cmd_extra_args %} tgt_cmd_extra_args = {{ spec.tgt_cmd_extra_args }} {% endif %} +{% if iobuf_options %} +iobuf_options = {{ iobuf_options }} +{% endif %} [monitor] timeout = {{ spec.monitor_timeout }} diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index ecfd899af71..961da145dac 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 @@ -28,6 +28,8 @@ alerting: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} - scheme: http http_sd_configs: @@ -56,6 +58,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} honor_labels: true http_sd_configs: @@ -81,6 +85,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ node_exporter_sd_url }} @@ -104,6 +110,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ haproxy_sd_url }} @@ -128,6 +136,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} honor_labels: true http_sd_configs: @@ -149,6 +159,8 @@ scrape_configs: password: {{ 
service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ nvmeof_sd_url }} @@ -169,6 +181,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ nfs_sd_url }} @@ -189,6 +203,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ smb_sd_url }} diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index b81510504d9..22bd26def91 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -1741,16 +1741,23 @@ class TestCephadm(object): nvmeof_client_cert = 'fake-nvmeof-client-cert' nvmeof_server_cert = 'fake-nvmeof-server-cert' nvmeof_root_ca_cert = 'fake-nvmeof-root-ca-cert' + grafana_cert_host_1 = 'grafana-cert-host-1' + grafana_cert_host_2 = 'grafana-cert-host-2' cephadm_module.cert_key_store.save_cert('rgw_frontend_ssl_cert', rgw_frontend_rgw_foo_host2_cert, service_name='rgw.foo', user_made=True) cephadm_module.cert_key_store.save_cert('nvmeof_server_cert', nvmeof_server_cert, service_name='nvmeof.foo', user_made=True) cephadm_module.cert_key_store.save_cert('nvmeof_client_cert', nvmeof_client_cert, service_name='nvmeof.foo', user_made=True) cephadm_module.cert_key_store.save_cert('nvmeof_root_ca_cert', nvmeof_root_ca_cert, service_name='nvmeof.foo', user_made=True) + cephadm_module.cert_key_store.save_cert('grafana_cert', grafana_cert_host_1, host='host-1', user_made=True) + cephadm_module.cert_key_store.save_cert('grafana_cert', grafana_cert_host_2, host='host-2', user_made=True) expected_calls = [ mock.call(f'{CERT_STORE_CERT_PREFIX}rgw_frontend_ssl_cert', json.dumps({'rgw.foo': 
Cert(rgw_frontend_rgw_foo_host2_cert, True).to_json()})), mock.call(f'{CERT_STORE_CERT_PREFIX}nvmeof_server_cert', json.dumps({'nvmeof.foo': Cert(nvmeof_server_cert, True).to_json()})), mock.call(f'{CERT_STORE_CERT_PREFIX}nvmeof_client_cert', json.dumps({'nvmeof.foo': Cert(nvmeof_client_cert, True).to_json()})), mock.call(f'{CERT_STORE_CERT_PREFIX}nvmeof_root_ca_cert', json.dumps({'nvmeof.foo': Cert(nvmeof_root_ca_cert, True).to_json()})), + mock.call(f'{CERT_STORE_CERT_PREFIX}grafana_cert', json.dumps({'host-1': Cert(grafana_cert_host_1, True).to_json()})), + mock.call(f'{CERT_STORE_CERT_PREFIX}grafana_cert', json.dumps({'host-1': Cert(grafana_cert_host_1, True).to_json(), + 'host-2': Cert(grafana_cert_host_2, True).to_json()})) ] _set_store.assert_has_calls(expected_calls) @@ -1795,17 +1802,20 @@ class TestCephadm(object): cephadm_module.cert_key_store._init_known_cert_key_dicts() grafana_host1_key = 'fake-grafana-host1-key' + grafana_host2_key = 'fake-grafana-host2-key' nvmeof_client_key = 'nvmeof-client-key' nvmeof_server_key = 'nvmeof-server-key' nvmeof_encryption_key = 'nvmeof-encryption-key' - grafana_host1_key = 'fake-grafana-host1-cert' cephadm_module.cert_key_store.save_key('grafana_key', grafana_host1_key, host='host1') + cephadm_module.cert_key_store.save_key('grafana_key', grafana_host2_key, host='host2') cephadm_module.cert_key_store.save_key('nvmeof_client_key', nvmeof_client_key, service_name='nvmeof.foo') cephadm_module.cert_key_store.save_key('nvmeof_server_key', nvmeof_server_key, service_name='nvmeof.foo') cephadm_module.cert_key_store.save_key('nvmeof_encryption_key', nvmeof_encryption_key, service_name='nvmeof.foo') expected_calls = [ mock.call(f'{CERT_STORE_KEY_PREFIX}grafana_key', json.dumps({'host1': PrivKey(grafana_host1_key).to_json()})), + mock.call(f'{CERT_STORE_KEY_PREFIX}grafana_key', json.dumps({'host1': PrivKey(grafana_host1_key).to_json(), + 'host2': PrivKey(grafana_host2_key).to_json()})), 
mock.call(f'{CERT_STORE_KEY_PREFIX}nvmeof_client_key', json.dumps({'nvmeof.foo': PrivKey(nvmeof_client_key).to_json()})), mock.call(f'{CERT_STORE_KEY_PREFIX}nvmeof_server_key', json.dumps({'nvmeof.foo': PrivKey(nvmeof_server_key).to_json()})), mock.call(f'{CERT_STORE_KEY_PREFIX}nvmeof_encryption_key', json.dumps({'nvmeof.foo': PrivKey(nvmeof_encryption_key).to_json()})), diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 6e117921ab3..d872219df80 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -408,6 +408,7 @@ prometheus_exporter_ssl = False prometheus_port = 10008 prometheus_stats_interval = 10 verify_nqns = True +verify_keys = True omap_file_lock_duration = 20 omap_file_lock_retries = 30 omap_file_lock_retry_sleep_interval = 1.0 @@ -580,7 +581,14 @@ class TestMonitoring: mock_getfqdn.return_value = purl.hostname with with_host(cephadm_module, "test"): - with with_service(cephadm_module, AlertManagerSpec()): + cephadm_module.cache.update_host_networks('test', { + '1.2.3.0/24': { + 'if0': ['1.2.3.1'] + }, + }) + with with_service(cephadm_module, AlertManagerSpec('alertmanager', + networks=['1.2.3.0/24'], + only_bind_port_on_networks=True)): y = dedent(self._get_config(expected_yaml_url)).lstrip() _run_cephadm.assert_called_with( 'test', @@ -594,11 +602,12 @@ class TestMonitoring: "deploy_arguments": [], "params": { 'tcp_ports': [9093, 9094], + 'port_ips': {"9094": "1.2.3.1"}, }, "meta": { 'service_name': 'alertmanager', 'ports': [9093, 9094], - 'ip': None, + 'ip': '1.2.3.1', 'deployed_by': [], 'rank': None, 'rank_generation': None, @@ -611,6 +620,7 @@ class TestMonitoring: }, "peers": [], "use_url_prefix": False, + "ip_to_bind_to": "1.2.3.1", } }), error_ok=True, @@ -633,8 +643,16 @@ class TestMonitoring: cephadm_module.secure_monitoring_stack = True cephadm_module.set_store(AlertmanagerService.USER_CFG_KEY, 'alertmanager_user') 
cephadm_module.set_store(AlertmanagerService.PASS_CFG_KEY, 'alertmanager_plain_password') + + cephadm_module.cache.update_host_networks('test', { + 'fd12:3456:789a::/64': { + 'if0': ['fd12:3456:789a::10'] + }, + }) with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ - with_service(cephadm_module, AlertManagerSpec()): + with_service(cephadm_module, AlertManagerSpec('alertmanager', + networks=['fd12:3456:789a::/64'], + only_bind_port_on_networks=True)): y = dedent(""" # This file is generated by cephadm. @@ -645,6 +663,8 @@ class TestMonitoring: http_config: tls_config: ca_file: root_cert.pem + cert_file: alertmanager.crt + key_file: alertmanager.key route: receiver: 'default' @@ -685,11 +705,12 @@ class TestMonitoring: "deploy_arguments": [], "params": { 'tcp_ports': [9093, 9094], + 'port_ips': {"9094": "fd12:3456:789a::10"} }, "meta": { 'service_name': 'alertmanager', 'ports': [9093, 9094], - 'ip': None, + 'ip': 'fd12:3456:789a::10', 'deployed_by': [], 'rank': None, 'rank_generation': None, @@ -707,6 +728,7 @@ class TestMonitoring: 'peers': [], 'web_config': '/etc/alertmanager/web.yml', "use_url_prefix": True, + "ip_to_bind_to": "fd12:3456:789a::10", } }), error_ok=True, @@ -740,6 +762,8 @@ class TestMonitoring: http_config: tls_config: ca_file: root_cert.pem + cert_file: alertmanager.crt + key_file: alertmanager.key route: receiver: 'default' @@ -800,6 +824,7 @@ class TestMonitoring: 'peers': [], 'web_config': '/etc/alertmanager/web.yml', "use_url_prefix": False, + "ip_to_bind_to": "", } }), error_ok=True, @@ -1169,6 +1194,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key scrape_configs: - job_name: 'ceph' @@ -1190,6 +1217,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'node' relabel_configs: @@ -1208,6 +1237,8 @@ class TestMonitoring: password: 
sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'haproxy' relabel_configs: @@ -1224,6 +1255,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'ceph-exporter' relabel_configs: @@ -1241,6 +1274,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'nvmeof' honor_labels: true @@ -1254,6 +1289,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'nfs' honor_labels: true @@ -1267,6 +1304,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'smb' honor_labels: true @@ -1280,6 +1319,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key """).lstrip() @@ -2070,6 +2111,26 @@ class TestRGWService: }) assert f == expected + @pytest.mark.parametrize( + "disable_sync_traffic", + [ + (True), + (False), + ] + ) + @patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) + def test_rgw_disable_sync_traffic(self, disable_sync_traffic, cephadm_module: CephadmOrchestrator): + with with_host(cephadm_module, 'host1'): + s = RGWSpec(service_id="foo", + disable_multisite_sync_traffic=disable_sync_traffic) + with with_service(cephadm_module, s) as dds: + _, f, _ = cephadm_module.check_mon_command({ + 'prefix': 'config get', + 'who': f'client.{dds[0]}', + 'key': 'rgw_run_sync_thread', + }) + assert f == ('false' if disable_sync_traffic else 'true') + class TestMonService: @@ -3873,6 +3934,7 @@ class TestMgmtGateway: http { #access_log /dev/stdout; + error_log /dev/stderr info; client_header_buffer_size 32K; large_client_header_buffers 4 32k; 
proxy_busy_buffers_size 512k; @@ -4120,6 +4182,7 @@ class TestMgmtGateway: http { #access_log /dev/stdout; + error_log /dev/stderr info; client_header_buffer_size 32K; large_client_header_buffers 4 32k; proxy_busy_buffers_size 512k; diff --git a/src/pybind/mgr/dashboard/HACKING.rst b/src/pybind/mgr/dashboard/HACKING.rst index 39c3d6744b9..6da428a0d5f 100644 --- a/src/pybind/mgr/dashboard/HACKING.rst +++ b/src/pybind/mgr/dashboard/HACKING.rst @@ -4,7 +4,7 @@ Ceph Dashboard Developer Documentation Note: The content of this file has been moved into the Ceph Developer Guide. If you're interested in helping with the development of the dashboard, please -see ``/doc/dev/developer_guide/dash_devel.rst`` or the `online version +see ``/doc/dev/developer_guide/dash-devel.rst`` or the `online version <https://ceph.readthedocs.io/en/latest/dev/developer_guide/dash-devel/>`_ for details on how to set up a development environment and other development-related topics. diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index a505801eea5..4fbc975ae9f 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -747,6 +747,10 @@ class Orchestrator(object): """ raise NotImplementedError() + def set_osd_spec(self, service_name: str, osd_ids: List[str]) -> OrchResult: + """ set service of osd """ + raise NotImplementedError() + def blink_device_light(self, ident_fault: str, on: bool, locations: List['DeviceLightLoc']) -> OrchResult[List[str]]: """ Instructs the orchestrator to enable or disable either the ident or the fault LED. 
diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index 332bc75d862..d5a1bb3da2b 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -1472,6 +1472,14 @@ Usage: return HandleCommandResult(stdout=out) + @_cli_write_command('orch osd set-spec-affinity') + def _osd_set_spec(self, service_name: str, osd_id: List[str]) -> HandleCommandResult: + """Set service spec affinity for osd""" + completion = self.set_osd_spec(service_name, osd_id) + res = raise_if_exception(completion) + + return HandleCommandResult(stdout=res) + @_cli_write_command('orch daemon add') def daemon_add_misc(self, daemon_type: Optional[ServiceType] = None, @@ -1666,7 +1674,13 @@ Usage: specs: List[Union[ServiceSpec, HostSpec]] = [] # YAML '---' document separator with no content generates # None entries in the output. Let's skip them silently. - content = [o for o in yaml_objs if o is not None] + try: + content = [o for o in yaml_objs if o is not None] + except yaml.scanner.ScannerError as e: + msg = f"Invalid YAML received : {str(e)}" + self.log.exception(msg) + return HandleCommandResult(-errno.EINVAL, stderr=msg) + for s in content: try: spec = json_to_generic_spec(s) @@ -2191,7 +2205,13 @@ Usage: specs: List[TunedProfileSpec] = [] # YAML '---' document separator with no content generates # None entries in the output. Let's skip them silently. 
- content = [o for o in yaml_objs if o is not None] + try: + content = [o for o in yaml_objs if o is not None] + except yaml.scanner.ScannerError as e: + msg = f"Invalid YAML received : {str(e)}" + self.log.exception(msg) + return HandleCommandResult(-errno.EINVAL, stderr=msg) + for spec in content: specs.append(TunedProfileSpec.from_json(spec)) else: diff --git a/src/pybind/mgr/snap_schedule/fs/schedule_client.py b/src/pybind/mgr/snap_schedule/fs/schedule_client.py index b58f20f1275..12e5e980737 100644 --- a/src/pybind/mgr/snap_schedule/fs/schedule_client.py +++ b/src/pybind/mgr/snap_schedule/fs/schedule_client.py @@ -163,6 +163,7 @@ class SnapSchedClient(CephfsClient): self.sqlite_connections: Dict[str, DBInfo] = {} self.active_timers: Dict[Tuple[str, str], List[Timer]] = {} self.conn_lock: Lock = Lock() # lock to protect add/lookup db connections + self.timers_lock: Lock = Lock() # restart old schedules for fs_name in self.get_all_filesystems(): @@ -273,6 +274,27 @@ class SnapSchedClient(CephfsClient): if self._is_allowed_repeat(r, path)][0:1] return rows + def delete_references_to_unavailable_fs(self, available_fs_names: Set[str]) -> None: + fs_to_remove: Set[str] = set() + self.timers_lock.acquire() + for fs, path in list(self.active_timers.keys()): # each key is a tuple + if fs not in available_fs_names: + fs_to_remove.add(fs) + log.debug(f'Cancelled timers for "{fs}:{path}"') + for t in self.active_timers[(fs, path)]: + t.cancel() + log.debug(f'Removed timer instance for "{fs}"') + del self.active_timers[(fs, path)] + self.timers_lock.release() + + self.conn_lock.acquire() + for fs in fs_to_remove: + log.debug(f'Closed DB connection to "{fs}"') + self.sqlite_connections[fs].db.close() + log.debug(f'Removed DB connection to "{fs}"') + del self.sqlite_connections[fs] + self.conn_lock.release() + def refresh_snap_timers(self, fs: str, path: str, olddb: Optional[sqlite3.Connection] = None) -> None: try: log.debug((f'SnapDB on {fs} changed for {path}, ' @@ -286,6 
+308,7 @@ class SnapSchedClient(CephfsClient): with self.get_schedule_db(fs) as conn_mgr: db = conn_mgr.dbinfo.db rows = self.fetch_schedules(db, path) + self.timers_lock.acquire() timers = self.active_timers.get((fs, path), []) for timer in timers: timer.cancel() @@ -299,6 +322,7 @@ class SnapSchedClient(CephfsClient): timers.append(t) log.debug(f'Will snapshot {path} in fs {fs} in {row[1]}s') self.active_timers[(fs, path)] = timers + self.timers_lock.release() except Exception: self._log_exception('refresh_snap_timers') diff --git a/src/pybind/mgr/snap_schedule/module.py b/src/pybind/mgr/snap_schedule/module.py index d8f04a62b94..adf982448b1 100644 --- a/src/pybind/mgr/snap_schedule/module.py +++ b/src/pybind/mgr/snap_schedule/module.py @@ -8,12 +8,14 @@ import json import sqlite3 from typing import Any, Dict, Optional, Tuple, Union from .fs.schedule_client import SnapSchedClient -from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option +from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option, NotifyType from mgr_util import CephfsConnectionException from threading import Event class Module(MgrModule): + NOTIFY_TYPES = [NotifyType.fs_map] + MODULE_OPTIONS = [ Option( 'allow_m_granularity', @@ -37,6 +39,21 @@ class Module(MgrModule): self._initialized = Event() self.client = SnapSchedClient(self) + def notify(self, notify_type: NotifyType, notify_id: str) -> None: + if notify_type != NotifyType.fs_map: + return + fs_map = self.get('fs_map') + if not fs_map: + return + + # we don't know for which fs config has been changed + fs_names = set() + for fs in fs_map['filesystems']: + fs_name = fs['mdsmap']['fs_name'] + fs_names.add(fs_name) + + self.client.delete_references_to_unavailable_fs(fs_names) + def _subvolume_exist(self, fs: str, subvol: Union[str, None], group: Union[str, None]) -> bool: rc, subvolumes, err = self.remote('volumes', 'subvolume_ls', fs, group) if rc == 0: diff --git 
a/src/pybind/mgr/volumes/fs/operations/volume.py b/src/pybind/mgr/volumes/fs/operations/volume.py index b2574fd76d5..93844bce119 100644 --- a/src/pybind/mgr/volumes/fs/operations/volume.py +++ b/src/pybind/mgr/volumes/fs/operations/volume.py @@ -133,7 +133,11 @@ def delete_volume(mgr, volname, metadata_pool, data_pools): r, outb, outs = remove_pool(mgr, data_pool) if r != 0: return r, outb, outs - result_str = "metadata pool: {0} data pool: {1} removed".format(metadata_pool, str(data_pools)) + result_str = f"metadata pool: {metadata_pool} data pool: {str(data_pools)} removed.\n" + result_str += "If there are active snapshot schedules associated with this " + result_str += "volume, you might see EIO errors in the mgr logs or at the " + result_str += "snap-schedule command-line due to the missing volume. " + result_str += "However, these errors are transient and will get auto-resolved." return r, result_str, "" def rename_volume(mgr, volname: str, newvolname: str) -> Tuple[int, str, str]: diff --git a/src/pybind/rados/rados.pyx b/src/pybind/rados/rados.pyx index b54ebb483c6..bcfa6777f3d 100644 --- a/src/pybind/rados/rados.pyx +++ b/src/pybind/rados/rados.pyx @@ -1870,7 +1870,7 @@ cdef class WriteOp(object): uint64_t _offset = offset with nogil: - rados_write_op_zero(self.write_op, _length, _offset) + rados_write_op_zero(self.write_op, _offset, _length) def truncate(self, offset: int): """ diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 9fdecb30346..6869d5b2188 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -1231,6 +1231,7 @@ class RGWSpec(ServiceSpec): rgw_bucket_counters_cache: Optional[bool] = False, rgw_bucket_counters_cache_size: Optional[int] = None, generate_cert: bool = False, + disable_multisite_sync_traffic: Optional[bool] = None, ): assert service_type == 'rgw', service_type @@ -1283,6 +1284,8 @@ class 
RGWSpec(ServiceSpec): self.rgw_bucket_counters_cache_size = rgw_bucket_counters_cache_size #: Whether we should generate a cert/key for the user if not provided self.generate_cert = generate_cert + #: Used to make RGW not do multisite replication so it can dedicate to IO + self.disable_multisite_sync_traffic = disable_multisite_sync_traffic def get_port_start(self) -> List[int]: return [self.get_port()] @@ -1351,6 +1354,7 @@ class NvmeofServiceSpec(ServiceSpec): prometheus_stats_interval: Optional[int] = 10, bdevs_per_cluster: Optional[int] = 32, verify_nqns: Optional[bool] = True, + verify_keys: Optional[bool] = True, allowed_consecutive_spdk_ping_failures: Optional[int] = 1, spdk_ping_interval_in_seconds: Optional[float] = 2.0, ping_spdk_under_lock: Optional[bool] = False, @@ -1380,6 +1384,7 @@ class NvmeofServiceSpec(ServiceSpec): transport_tcp_options: Optional[Dict[str, int]] = {"in_capsule_data_size": 8192, "max_io_qpairs_per_ctrlr": 7}, tgt_cmd_extra_args: Optional[str] = None, + iobuf_options: Optional[Dict[str, int]] = None, discovery_addr: Optional[str] = None, discovery_addr_map: Optional[Dict[str, str]] = None, discovery_port: Optional[int] = None, @@ -1450,6 +1455,8 @@ class NvmeofServiceSpec(ServiceSpec): self.prometheus_stats_interval = prometheus_stats_interval #: ``verify_nqns`` enables verification of subsystem and host NQNs for validity self.verify_nqns = verify_nqns + #: ``verify_keys`` enables verification of PSJ and DHCHAP keys in the gateway + self.verify_keys = verify_keys #: ``omap_file_lock_duration`` number of seconds before automatically unlock OMAP file lock self.omap_file_lock_duration = omap_file_lock_duration #: ``omap_file_lock_retries`` number of retries to lock OMAP file before giving up @@ -1514,6 +1521,8 @@ class NvmeofServiceSpec(ServiceSpec): self.transport_tcp_options: Optional[Dict[str, int]] = transport_tcp_options #: ``tgt_cmd_extra_args`` extra arguments for the nvmf_tgt process self.tgt_cmd_extra_args = 
tgt_cmd_extra_args + #: List of extra arguments for SPDK iobuf in the form opt=value + self.iobuf_options: Optional[Dict[str, int]] = iobuf_options #: ``discovery_addr`` address of the discovery service self.discovery_addr = discovery_addr #: ``discovery_addr_map`` per node address map of the discovery service @@ -1622,6 +1631,7 @@ class NvmeofServiceSpec(ServiceSpec): verify_boolean(self.enable_key_encryption, "Enable key encryption") verify_boolean(self.enable_prometheus_exporter, "Enable Prometheus exporter") verify_boolean(self.verify_nqns, "Verify NQNs") + verify_boolean(self.verify_keys, "Verify Keys") verify_boolean(self.log_files_enabled, "Log files enabled") verify_boolean(self.log_files_rotation_enabled, "Log files rotation enabled") verify_boolean(self.verbose_log_messages, "Verbose log messages") @@ -2324,6 +2334,7 @@ class AlertManagerSpec(MonitoringSpec): user_data: Optional[Dict[str, Any]] = None, config: Optional[Dict[str, str]] = None, networks: Optional[List[str]] = None, + only_bind_port_on_networks: bool = False, port: Optional[int] = None, secure: bool = False, extra_container_args: Optional[GeneralArgList] = None, @@ -2354,6 +2365,7 @@ class AlertManagerSpec(MonitoringSpec): # <webhook_configs> configuration. self.user_data = user_data or {} self.secure = secure + self.only_bind_port_on_networks = only_bind_port_on_networks def get_port_start(self) -> List[int]: return [self.get_port(), 9094] @@ -2400,7 +2412,7 @@ class GrafanaSpec(MonitoringSpec): self.protocol = protocol # whether ports daemons for this service bind to should - # bind to only hte networks listed in networks param, or + # bind to only the networks listed in networks param, or # to all networks. Defaults to false which is saying to bind # on all networks. 
self.only_bind_port_on_networks = only_bind_port_on_networks diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt index 3727c525ce7..41e473e23f0 100644 --- a/src/rgw/CMakeLists.txt +++ b/src/rgw/CMakeLists.txt @@ -487,9 +487,9 @@ target_link_libraries(radosgw PRIVATE install(TARGETS radosgw DESTINATION bin) set(radosgw_admin_srcs - rgw_admin.cc - rgw_sync_checkpoint.cc - rgw_orphan.cc) + radosgw-admin/radosgw-admin.cc + radosgw-admin/sync_checkpoint.cc + radosgw-admin/orphan.cc) # this is unsatisfying and hopefully temporary; ARROW should not be # part of radosgw_admin diff --git a/src/rgw/driver/daos/rgw_sal_daos.cc b/src/rgw/driver/daos/rgw_sal_daos.cc index a87d88c4b85..92dd7afe2fb 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.cc +++ b/src/rgw/driver/daos/rgw_sal_daos.cc @@ -858,8 +858,6 @@ bool DaosZone::is_writeable() { return true; } bool DaosZone::get_redirect_endpoint(std::string* endpoint) { return false; } -bool DaosZone::has_zonegroup_api(const std::string& api) const { return false; } - const std::string& DaosZone::get_current_period_id() { return current_period->get_id(); } diff --git a/src/rgw/driver/daos/rgw_sal_daos.h b/src/rgw/driver/daos/rgw_sal_daos.h index e382fdb04ae..5515579a441 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.h +++ b/src/rgw/driver/daos/rgw_sal_daos.h @@ -484,7 +484,6 @@ class DaosZone : public StoreZone { virtual const std::string& get_name() const override; virtual bool is_writeable() override; virtual bool get_redirect_endpoint(std::string* endpoint) override; - virtual bool has_zonegroup_api(const std::string& api) const override; virtual const std::string& get_current_period_id() override; virtual const RGWAccessKey& get_system_key() { return zone_params->system_key; diff --git a/src/rgw/driver/motr/rgw_sal_motr.cc b/src/rgw/driver/motr/rgw_sal_motr.cc index b999673ac18..463ea8c5b11 100644 --- a/src/rgw/driver/motr/rgw_sal_motr.cc +++ b/src/rgw/driver/motr/rgw_sal_motr.cc @@ -1111,11 +1111,6 @@ bool 
MotrZone::get_redirect_endpoint(std::string* endpoint) return false; } -bool MotrZone::has_zonegroup_api(const std::string& api) const -{ - return (zonegroup.group.api_name == api); -} - const std::string& MotrZone::get_current_period_id() { return current_period->get_id(); diff --git a/src/rgw/driver/motr/rgw_sal_motr.h b/src/rgw/driver/motr/rgw_sal_motr.h index f92074b9d94..0f99ae48e86 100644 --- a/src/rgw/driver/motr/rgw_sal_motr.h +++ b/src/rgw/driver/motr/rgw_sal_motr.h @@ -525,7 +525,6 @@ class MotrZone : public StoreZone { virtual const std::string& get_name() const override; virtual bool is_writeable() override; virtual bool get_redirect_endpoint(std::string* endpoint) override; - virtual bool has_zonegroup_api(const std::string& api) const override; virtual const std::string& get_current_period_id() override; virtual const RGWAccessKey& get_system_key() { return zone_params->system_key; } virtual const std::string& get_realm_name() { return realm->get_name(); } diff --git a/src/rgw/driver/posix/rgw_sal_posix.cc b/src/rgw/driver/posix/rgw_sal_posix.cc index 1345468210f..9d76462baa0 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.cc +++ b/src/rgw/driver/posix/rgw_sal_posix.cc @@ -2893,6 +2893,14 @@ int POSIXObject::copy_object(const ACLOwner& owner, return dobj->set_obj_attrs(dpp, &attrs, nullptr, y, rgw::sal::FLAG_LOG_OP); } +int POSIXObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) +{ + return -EOPNOTSUPP; +} + int POSIXObject::load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh) { int ret = stat(dpp); diff --git a/src/rgw/driver/posix/rgw_sal_posix.h b/src/rgw/driver/posix/rgw_sal_posix.h index 8ec72bbc1bc..bf3478ad6ab 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.h +++ b/src/rgw/driver/posix/rgw_sal_posix.h @@ -653,6 +653,13 @@ public: const DoutPrefixProvider* dpp, optional_yield y) override; 
virtual RGWAccessControlPolicy& get_acl(void) override { return acls; } virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; } + + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) override; + virtual int load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh = true) override; virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y, uint32_t flags) override; diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc index 792671579b7..c0a9059a251 100644 --- a/src/rgw/driver/rados/rgw_data_sync.cc +++ b/src/rgw/driver/rados/rgw_data_sync.cc @@ -2617,6 +2617,7 @@ class RGWUserPermHandler { rgw::IAM::Environment env; std::unique_ptr<rgw::auth::Identity> identity; RGWAccessControlPolicy user_acl; + std::vector<rgw::IAM::Policy> user_policies; }; std::shared_ptr<_info> info; @@ -2644,7 +2645,7 @@ class RGWUserPermHandler { } auto result = rgw::auth::transform_old_authinfo( - sync_env->dpp, null_yield, sync_env->driver, user.get()); + sync_env->dpp, null_yield, sync_env->driver, user.get(), &info->user_policies); if (!result) { return result.error(); } @@ -2679,6 +2680,7 @@ public: std::shared_ptr<_info> info; RGWAccessControlPolicy bucket_acl; std::optional<perm_state> ps; + boost::optional<rgw::IAM::Policy> bucket_policy; public: Bucket() {} @@ -2686,9 +2688,7 @@ public: const RGWBucketInfo& bucket_info, const map<string, bufferlist>& bucket_attrs); - bool verify_bucket_permission(int perm); - bool verify_object_permission(const map<string, bufferlist>& obj_attrs, - int perm); + bool verify_bucket_permission(const rgw_obj_key& obj_key, const uint64_t op); }; static int 
policy_from_attrs(CephContext *cct, @@ -2728,6 +2728,14 @@ int RGWUserPermHandler::Bucket::init(RGWUserPermHandler *handler, return r; } + // load bucket policy + try { + bucket_policy = get_iam_policy_from_attr(sync_env->cct, bucket_attrs, bucket_info.bucket.tenant); + } catch (const std::exception& e) { + ldpp_dout(sync_env->dpp, 0) << "ERROR: reading IAM Policy: " << e.what() << dendl; + return -EACCES; + } + ps.emplace(sync_env->cct, info->env, info->identity.get(), @@ -2740,36 +2748,40 @@ int RGWUserPermHandler::Bucket::init(RGWUserPermHandler *handler, return 0; } -bool RGWUserPermHandler::Bucket::verify_bucket_permission(int perm) -{ - return verify_bucket_permission_no_policy(sync_env->dpp, - &(*ps), - info->user_acl, - bucket_acl, - perm); -} - -bool RGWUserPermHandler::Bucket::verify_object_permission(const map<string, bufferlist>& obj_attrs, - int perm) +bool RGWUserPermHandler::Bucket::verify_bucket_permission(const rgw_obj_key& obj_key, const uint64_t op) { - RGWAccessControlPolicy obj_acl; - - int r = policy_from_attrs(sync_env->cct, obj_attrs, &obj_acl); - if (r < 0) { - return r; - } - - return verify_bucket_permission_no_policy(sync_env->dpp, - &(*ps), - bucket_acl, - obj_acl, - perm); + const rgw_obj obj(ps->bucket_info.bucket, obj_key); + const auto arn = rgw::ARN(obj); + + if (ps->identity->get_account()) { + const bool account_root = (ps->identity->get_identity_type() == TYPE_ROOT); + if (!ps->identity->is_owner_of(bucket_acl.get_owner().id)) { + ldpp_dout(sync_env->dpp, 4) << "cross-account request for bucket owner " + << bucket_acl.get_owner().id << " != " << ps->identity->get_aclowner().id << dendl; + // cross-account requests evaluate the identity-based policies separately + // from the resource-based policies and require Allow from both + return ::verify_bucket_permission(sync_env->dpp, &(*ps), arn, account_root, {}, {}, {}, + info->user_policies, {}, op) + && ::verify_bucket_permission(sync_env->dpp, &(*ps), arn, false, info->user_acl, + 
bucket_acl, bucket_policy, {}, {}, op); + } else { + // don't consult acls for same-account access. require an Allow from + // either identity- or resource-based policy + return ::verify_bucket_permission(sync_env->dpp, &(*ps), arn, account_root, {}, {}, + bucket_policy, info->user_policies, + {}, op); + } + } + constexpr bool account_root = false; + return ::verify_bucket_permission(sync_env->dpp, &(*ps), arn, account_root, + info->user_acl, bucket_acl, + bucket_policy, info->user_policies, + {}, op); } class RGWFetchObjFilter_Sync : public RGWFetchObjFilter_Default { rgw_bucket_sync_pipe sync_pipe; - std::shared_ptr<RGWUserPermHandler::Bucket> bucket_perms; std::optional<rgw_sync_pipe_dest_params> verify_dest_params; std::optional<ceph::real_time> mtime; @@ -2782,10 +2794,8 @@ class RGWFetchObjFilter_Sync : public RGWFetchObjFilter_Default { public: RGWFetchObjFilter_Sync(rgw_bucket_sync_pipe& _sync_pipe, - std::shared_ptr<RGWUserPermHandler::Bucket>& _bucket_perms, std::optional<rgw_sync_pipe_dest_params>&& _verify_dest_params, std::shared_ptr<bool>& _need_retry) : sync_pipe(_sync_pipe), - bucket_perms(_bucket_perms), verify_dest_params(std::move(_verify_dest_params)), need_retry(_need_retry) { *need_retry = false; @@ -2852,12 +2862,6 @@ int RGWFetchObjFilter_Sync::filter(CephContext *cct, *poverride_owner = acl_translation_owner; } } - if (params.mode == rgw_sync_pipe_params::MODE_USER) { - if (!bucket_perms->verify_object_permission(obj_attrs, RGW_PERM_READ)) { - ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to fetch object" << dendl; - return -EPERM; - } - } if (!dest_placement_rule && params.dest.storage_class) { @@ -2900,7 +2904,6 @@ class RGWObjFetchCR : public RGWCoroutine { rgw_sync_pipe_params::Mode param_mode; std::optional<RGWUserPermHandler> user_perms; - std::shared_ptr<RGWUserPermHandler::Bucket> source_bucket_perms; RGWUserPermHandler::Bucket dest_bucket_perms; std::optional<rgw_sync_pipe_dest_params> 
dest_params; @@ -3016,20 +3019,10 @@ public: return set_cr_error(retcode); } - if (!dest_bucket_perms.verify_bucket_permission(RGW_PERM_WRITE)) { + if (!dest_bucket_perms.verify_bucket_permission(dest_key.value_or(key), rgw::IAM::s3PutObject)) { ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to write into bucket (bucket=" << sync_pipe.info.dest_bucket.get_key() << ")" << dendl; return -EPERM; } - - /* init source bucket permission structure */ - source_bucket_perms = make_shared<RGWUserPermHandler::Bucket>(); - r = user_perms->init_bucket(sync_pipe.source_bucket_info, - sync_pipe.source_bucket_attrs, - source_bucket_perms.get()); - if (r < 0) { - ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl; - return set_cr_error(retcode); - } } yield { @@ -3037,12 +3030,11 @@ public: need_retry = make_shared<bool>(); } auto filter = make_shared<RGWFetchObjFilter_Sync>(sync_pipe, - source_bucket_perms, std::move(dest_params), need_retry); call(new RGWFetchRemoteObjCR(sync_env->async_rados, sync_env->driver, sc->source_zone, - nullopt, + param_user, sync_pipe.source_bucket_info.bucket, std::nullopt, sync_pipe.dest_bucket_info, key, dest_key, versioned_epoch, @@ -4528,7 +4520,7 @@ public: } tn->set_resource_name(SSTR(bucket_str_noinstance(bs.bucket) << "/" << key)); } - if (retcode == -ERR_PRECONDITION_FAILED) { + if (retcode == -ERR_PRECONDITION_FAILED || retcode == -EPERM) { pretty_print(sc->env, "Skipping object s3://{}/{} in sync from zone {}\n", bs.bucket.name, key, zone_name); set_status("Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)"); diff --git a/src/rgw/driver/rados/rgw_datalog.cc b/src/rgw/driver/rados/rgw_datalog.cc index 4c9503071ef..d7e57d7e1c1 100644 --- a/src/rgw/driver/rados/rgw_datalog.cc +++ b/src/rgw/driver/rados/rgw_datalog.cc @@ -576,7 
+576,7 @@ int RGWDataChangesLog::renew_entries(const DoutPrefixProvider *dpp) if (ret < 0) { /* we don't really need to have a special handling for failed cases here, * as this is just an optimization. */ - ldpp_dout(dpp, -1) << "ERROR: svc.cls->timelog.add() returned " << ret << dendl; + ldpp_dout(dpp, -1) << "ERROR: be->push() returned " << ret << dendl; return ret; } diff --git a/src/rgw/driver/rados/rgw_period.cc b/src/rgw/driver/rados/rgw_period.cc index f18e8e46bc5..aacb9b6a09a 100644 --- a/src/rgw/driver/rados/rgw_period.cc +++ b/src/rgw/driver/rados/rgw_period.cc @@ -68,20 +68,6 @@ int RGWPeriod::delete_obj(const DoutPrefixProvider *dpp, optional_yield y) return ret; } -int RGWPeriod::add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y) -{ - if (zonegroup.realm_id != realm_id) { - return 0; - } - int ret = period_map.update(zonegroup, cct); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl; - return ret; - } - - return store_info(dpp, false, y); -} - int RGWPeriod::update(const DoutPrefixProvider *dpp, optional_yield y) { auto zone_svc = sysobj_svc->get_zone_svc(); diff --git a/src/rgw/driver/rados/rgw_pubsub_push.cc b/src/rgw/driver/rados/rgw_pubsub_push.cc index 07d65fa1028..d22c61e9b08 100644 --- a/src/rgw/driver/rados/rgw_pubsub_push.cc +++ b/src/rgw/driver/rados/rgw_pubsub_push.cc @@ -281,7 +281,7 @@ public: conn_id, _endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true), args.get_optional("ca-location"), args.get_optional("mechanism"), args.get_optional("user-name"), - args.get_optional("password"))) { + args.get_optional("password"), args.get_optional("kafka-brokers"))) { throw configuration_error("Kafka: failed to create connection to: " + _endpoint); } @@ -434,4 +434,3 @@ void RGWPubSubEndpoint::shutdown_all() { #endif shutdown_http_manager(); } - diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc 
index 2ba3559c006..a183feabe2a 100644 --- a/src/rgw/driver/rados/rgw_rados.cc +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -5485,7 +5485,7 @@ int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& ob } /* if the bucket is not synced we can remove the meta file */ - if (!svc.zone->is_syncing_bucket_meta(bucket)) { + if (!svc.zone->is_syncing_bucket_meta()) { RGWObjVersionTracker objv_tracker; r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, y, dpp); if (r < 0) { @@ -6962,13 +6962,13 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBu } return 0; -} +} /* RGWRados::set_attrs() */ -static int get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y, - RGWRados* store, RGWBucketInfo& bucket_info, - RGWObjectCtx* rctx, RGWObjManifest* manifest, - int part_num, int* parts_count, bool prefetch, - RGWObjState** pstate, RGWObjManifest** pmanifest) +int RGWRados::get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y, + RGWRados* store, RGWBucketInfo& bucket_info, + RGWObjectCtx* rctx, RGWObjManifest* manifest, + int part_num, int* parts_count, bool prefetch, + RGWObjState** pstate, RGWObjManifest** pmanifest) { if (!manifest) { return -ERR_INVALID_PART; @@ -7047,6 +7047,9 @@ static int get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y, // update the object size sm->state.size = part_manifest.get_obj_size(); + if (!sm->state.attrset.count(RGW_ATTR_COMPRESSION)) { + sm->state.accounted_size = sm->state.size; + } *pmanifest = &part_manifest; return 0; @@ -8948,7 +8951,7 @@ int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, return r; } - auto iter = attrset.find(RGW_ATTR_OLH_VER); + auto iter = attrset.find(RGW_ATTR_OLH_INFO); if (iter == attrset.end()) { /* not an olh */ return -EINVAL; } diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h index b24823b60dc..fe79916392f 100644 --- 
a/src/rgw/driver/rados/rgw_rados.h +++ b/src/rgw/driver/rados/rgw_rados.h @@ -1071,6 +1071,12 @@ public: }; // class RGWRados::Bucket::List }; // class RGWRados::Bucket + static int get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y, + RGWRados* store, RGWBucketInfo& bucket_info, + RGWObjectCtx* rctx, RGWObjManifest* manifest, + int part_num, int* parts_count, bool prefetch, + RGWObjState** pstate, RGWObjManifest** pmanifest); + int on_last_entry_in_listing(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const std::string& obj_prefix, diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc index 88da446c3de..4c05421653b 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.cc +++ b/src/rgw/driver/rados/rgw_sal_rados.cc @@ -429,6 +429,10 @@ int RadosBucket::remove(const DoutPrefixProvider* dpp, ldpp_dout(dpp, -1) << "ERROR: unable to remove notifications from bucket. ret=" << ps_ret << dendl; } + if (ret = rgw::bucketlogging::bucket_deletion_cleanup(dpp, store, this, y); ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: could not cleanup bucket logging configuration and pending objects, ret = " << ret << dendl; + } + ret = store->ctl()->bucket->unlink_bucket(rados, info.owner, info.bucket, y, dpp, false); if (ret < 0) { @@ -1024,15 +1028,15 @@ int RadosBucket::remove_topics(RGWObjVersionTracker* objv_tracker, objv_tracker, y); } -int RadosBucket::get_logging_object_name(std::string& obj_name, - const std::string& prefix, - optional_yield y, +int RadosBucket::get_logging_object_name(std::string& obj_name, + const std::string& prefix, + optional_yield y, const DoutPrefixProvider *dpp, RGWObjVersionTracker* objv_tracker) { rgw_pool data_pool; const auto obj_name_oid = bucketlogging::object_name_oid(this, prefix); if (!store->getRados()->get_obj_data_pool(get_placement_rule(), rgw_obj{get_key(), obj_name_oid}, &data_pool)) { - ldpp_dout(dpp, 1) << "failed to get data pool for bucket '" << get_name() << + ldpp_dout(dpp, 
1) << "ERROR: failed to get data pool for bucket '" << get_name() << "' when getting logging object name" << dendl; return -EIO; } @@ -1048,23 +1052,23 @@ int RadosBucket::get_logging_object_name(std::string& obj_name, nullptr, nullptr); if (ret < 0) { - ldpp_dout(dpp, 1) << "failed to get logging object name from '" << obj_name_oid << "'. ret = " << ret << dendl; + ldpp_dout(dpp, 1) << "ERROR: failed to get logging object name from '" << obj_name_oid << "'. ret = " << ret << dendl; return ret; } obj_name = bl.to_str(); return 0; } -int RadosBucket::set_logging_object_name(const std::string& obj_name, - const std::string& prefix, - optional_yield y, - const DoutPrefixProvider *dpp, +int RadosBucket::set_logging_object_name(const std::string& obj_name, + const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, bool new_obj, RGWObjVersionTracker* objv_tracker) { rgw_pool data_pool; const auto obj_name_oid = bucketlogging::object_name_oid(this, prefix); if (!store->getRados()->get_obj_data_pool(get_placement_rule(), rgw_obj{get_key(), obj_name_oid}, &data_pool)) { - ldpp_dout(dpp, 1) << "failed to get data pool for bucket '" << get_name() << + ldpp_dout(dpp, 1) << "ERROR: failed to get data pool for bucket '" << get_name() << "' when setting logging object name" << dendl; return -EIO; } @@ -1080,28 +1084,65 @@ int RadosBucket::set_logging_object_name(const std::string& obj_name, y, nullptr); if (ret == -EEXIST) { - ldpp_dout(dpp, 20) << "race detected in initializing '" << obj_name_oid << "' with logging object name:'" << obj_name << "'. ret = " << ret << dendl; + ldpp_dout(dpp, 20) << "INFO: race detected in initializing '" << obj_name_oid << "' with logging object name:'" << obj_name << "'. ret = " << ret << dendl; } else if (ret == -ECANCELED) { - ldpp_dout(dpp, 20) << "race detected in updating logging object name '" << obj_name << "' at '" << obj_name_oid << "'. 
ret = " << ret << dendl; + ldpp_dout(dpp, 20) << "INFO: race detected in updating logging object name '" << obj_name << "' at '" << obj_name_oid << "'. ret = " << ret << dendl; } else if (ret < 0) { - ldpp_dout(dpp, 1) << "failed to set logging object name '" << obj_name << "' at '" << obj_name_oid << "'. ret = " << ret << dendl; + ldpp_dout(dpp, 1) << "ERROR: failed to set logging object name '" << obj_name << "' at '" << obj_name_oid << "'. ret = " << ret << dendl; } return ret; } +int RadosBucket::remove_logging_object_name(const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWObjVersionTracker* objv_tracker) { + rgw_pool data_pool; + const auto obj_name_oid = bucketlogging::object_name_oid(this, prefix); + if (!store->getRados()->get_obj_data_pool(get_placement_rule(), rgw_obj{get_key(), obj_name_oid}, &data_pool)) { + ldpp_dout(dpp, 1) << "ERROR: failed to get data pool for bucket '" << get_name() << + "' when setting logging object name" << dendl; + return -EIO; + } + return rgw_delete_system_obj(dpp, store->svc()->sysobj, + data_pool, + obj_name_oid, + objv_tracker, + y); +} + std::string to_temp_object_name(const rgw::sal::Bucket* bucket, const std::string& obj_name) { return fmt::format("{}__shadow_{}0", bucket->get_bucket_id(), obj_name); } +int RadosBucket::remove_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp) { + rgw_pool data_pool; + const rgw_obj head_obj{get_key(), obj_name}; + const auto placement_rule = get_placement_rule(); + + if (!store->getRados()->get_obj_data_pool(placement_rule, head_obj, &data_pool)) { + ldpp_dout(dpp, 1) << "ERROR: failed to get data pool for bucket '" << get_name() << + "' when deleting logging object" << dendl; + return -EIO; + } + + const auto temp_obj_name = to_temp_object_name(this, obj_name); + return rgw_delete_system_obj(dpp, store->svc()->sysobj, + data_pool, + temp_obj_name, + nullptr, + y); +} + int 
RadosBucket::commit_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp) { rgw_pool data_pool; const rgw_obj head_obj{get_key(), obj_name}; const auto placement_rule = get_placement_rule(); if (!store->getRados()->get_obj_data_pool(placement_rule, head_obj, &data_pool)) { - ldpp_dout(dpp, 1) << "failed to get data pool for bucket '" << get_name() << + ldpp_dout(dpp, 1) << "ERROR: failed to get data pool for bucket '" << get_name() << "' when comitting logging object" << dendl; return -EIO; } @@ -1110,7 +1151,6 @@ int RadosBucket::commit_logging_object(const std::string& obj_name, optional_yie std::map<string, bufferlist> obj_attrs; ceph::real_time mtime; bufferlist bl_data; - // TODO: this is needed only for etag calculation if (const auto ret = rgw_get_system_obj(store->svc()->sysobj, data_pool, temp_obj_name, @@ -1120,10 +1160,13 @@ int RadosBucket::commit_logging_object(const std::string& obj_name, optional_yie y, dpp, &obj_attrs, - nullptr); ret < 0) { - ldpp_dout(dpp, 1) << "faild to read logging data when comitting to object '" << temp_obj_name + nullptr); ret < 0 && ret != -ENOENT) { + ldpp_dout(dpp, 1) << "ERROR: failed to read logging data when comitting object '" << temp_obj_name << ". error: " << ret << dendl; return ret; + } else if (ret == -ENOENT) { + ldpp_dout(dpp, 1) << "WARNING: temporary logging object '" << temp_obj_name << "' does not exists" << dendl; + return 0; } uint64_t size = bl_data.length(); @@ -1137,13 +1180,13 @@ int RadosBucket::commit_logging_object(const std::string& obj_name, optional_yie nullptr, // no special placment for tail get_key(), head_obj); ret < 0) { - ldpp_dout(dpp, 1) << "failed to create manifest when comitting logging object. error: " << + ldpp_dout(dpp, 1) << "ERROR: failed to create manifest when comitting logging object. 
error: " << ret << dendl; return ret; } if (const auto ret = manifest_gen.create_next(size); ret < 0) { - ldpp_dout(dpp, 1) << "failed to add object to manifest when comitting logging object. error: " << + ldpp_dout(dpp, 1) << "ERROR: failed to add object to manifest when comitting logging object. error: " << ret << dendl; return ret; } @@ -1151,7 +1194,7 @@ int RadosBucket::commit_logging_object(const std::string& obj_name, optional_yie if (const auto expected_temp_obj = manifest_gen.get_cur_obj(store->getRados()); temp_obj_name != expected_temp_obj.oid) { // TODO: cleanup temporary object, commit would never succeed - ldpp_dout(dpp, 1) << "temporary logging object name mismatch: '" << + ldpp_dout(dpp, 1) << "ERROR: temporary logging object name mismatch: '" << temp_obj_name << "' != '" << expected_temp_obj.oid << "'" << dendl; return -EINVAL; } @@ -1182,11 +1225,11 @@ int RadosBucket::commit_logging_object(const std::string& obj_name, optional_yie const req_context rctx{dpp, y, nullptr}; jspan_context trace{false, false}; if (const auto ret = head_obj_wop.write_meta(0, size, obj_attrs, rctx, trace); ret < 0) { - ldpp_dout(dpp, 1) << "failed to commit logging object '" << temp_obj_name << - "' to bucket id '" << get_bucket_id() <<"'. error: " << ret << dendl; + ldpp_dout(dpp, 1) << "ERROR: failed to commit logging object '" << temp_obj_name << + "' to bucket id '" << get_info().bucket <<"'. 
error: " << ret << dendl; return ret; } - ldpp_dout(dpp, 20) << "committed logging object '" << temp_obj_name << + ldpp_dout(dpp, 20) << "INFO: committed logging object '" << temp_obj_name << "' with size of " << size << " bytes, to bucket '" << get_key() << "' as '" << obj_name << "'" << dendl; return 0; @@ -1204,30 +1247,30 @@ void bucket_logging_completion(rados_completion_t completion, void* args) { auto* aio_comp = reinterpret_cast<librados::AioCompletionImpl*>(completion); std::unique_ptr<BucketLoggingCompleteArg> logging_args(reinterpret_cast<BucketLoggingCompleteArg*>(args)); if (aio_comp->get_return_value() < 0) { - ldout(logging_args->cct, 1) << "failed to complete append to logging object '" << logging_args->obj_name << + ldout(logging_args->cct, 1) << "ERROR: failed to complete append to logging object '" << logging_args->obj_name << "'. ret = " << aio_comp->get_return_value() << dendl; } else { - ldout(logging_args->cct, 20) << "wrote " << logging_args->size << " bytes to logging object '" << + ldout(logging_args->cct, 20) << "INFO: wrote " << logging_args->size << " bytes to logging object '" << logging_args->obj_name << "'" << dendl; } } -int RadosBucket::write_logging_object(const std::string& obj_name, - const std::string& record, - optional_yield y, +int RadosBucket::write_logging_object(const std::string& obj_name, + const std::string& record, + optional_yield y, const DoutPrefixProvider *dpp, bool async_completion) { const auto temp_obj_name = to_temp_object_name(this, obj_name); rgw_pool data_pool; rgw_obj obj{get_key(), obj_name}; if (!store->getRados()->get_obj_data_pool(get_placement_rule(), obj, &data_pool)) { - ldpp_dout(dpp, 1) << "failed to get data pool for bucket '" << get_name() << + ldpp_dout(dpp, 1) << "ERROR: failed to get data pool for bucket '" << get_name() << "' when writing logging object" << dendl; return -EIO; } librados::IoCtx io_ctx; if (const auto ret = rgw_init_ioctx(dpp, store->getRados()->get_rados_handle(), data_pool, 
io_ctx); ret < 0) { - ldpp_dout(dpp, 1) << "failed to get IO context for logging object from data pool:" << data_pool.to_str() << dendl; + ldpp_dout(dpp, 1) << "ERROR: failed to get IO context for logging object from data pool:" << data_pool.to_str() << dendl; return -EIO; } bufferlist bl; @@ -1242,7 +1285,7 @@ int RadosBucket::write_logging_object(const std::string& obj_name, auto arg = std::make_unique<BucketLoggingCompleteArg>(temp_obj_name, record.length(), store->ctx()); completion->set_complete_callback(arg.get(), bucket_logging_completion); if (const auto ret = io_ctx.aio_operate(temp_obj_name, completion.get(), &op); ret < 0) { - ldpp_dout(dpp, 1) << "failed to append to logging object '" << temp_obj_name << + ldpp_dout(dpp, 1) << "ERROR: failed to append to logging object '" << temp_obj_name << "'. ret = " << ret << dendl; return ret; } @@ -1251,11 +1294,11 @@ int RadosBucket::write_logging_object(const std::string& obj_name, return 0; } if (const auto ret = rgw_rados_operate(dpp, io_ctx, temp_obj_name, &op, y); ret < 0) { - ldpp_dout(dpp, 1) << "failed to append to logging object '" << temp_obj_name << + ldpp_dout(dpp, 1) << "ERROR: failed to append to logging object '" << temp_obj_name << "'. 
ret = " << ret << dendl; return ret; } - ldpp_dout(dpp, 20) << "wrote " << record.length() << " bytes to logging object '" << + ldpp_dout(dpp, 20) << "INFO: wrote " << record.length() << " bytes to logging object '" << temp_obj_name << "'" << dendl; return 0; } @@ -2471,7 +2514,108 @@ bool RadosObject::is_sync_completed(const DoutPrefixProvider* dpp, const rgw_bi_log_entry& earliest_marker = entries.front(); return earliest_marker.timestamp > obj_mtime; -} +} /* is_sync_completed */ + +int RadosObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) +{ + int ret{0}; + + /* require an object with a manifest, so call to get_obj_state() must precede this */ + if (! manifest) { + return -EINVAL; + } + + RGWObjManifest::obj_iterator end = manifest->obj_end(dpp); + if (end.get_cur_part_id() == 0) { // not multipart + ldpp_dout(dpp, 20) << __func__ << " object does not have a multipart manifest" + << dendl; + return 0; + } + + auto end_part_id = end.get_cur_part_id(); + auto parts_count = (end_part_id == 1) ? 
1 : end_part_id - 1; + if (marker > (parts_count - 1)) { + return 0; + } + + RGWObjManifest::obj_iterator part_iter = manifest->obj_begin(dpp); + + if (marker != 0) { + ldpp_dout_fmt(dpp, 20, + "{} seeking to part #{} in the object manifest", + __func__, marker); + + part_iter = manifest->obj_find_part(dpp, marker + 1); + + if (part_iter == end) { + ldpp_dout_fmt(dpp, 5, + "{} failed to find part #{} in the object manifest", + __func__, marker + 1); + return 0; + } + } + + RGWObjectCtx& obj_ctx = get_ctx(); + RGWBucketInfo& bucket_info = get_bucket()->get_info(); + + Object::Part obj_part{}; + for (; part_iter != manifest->obj_end(dpp); ++part_iter) { + + /* we're only interested in the first object in each logical part */ + auto cur_part_id = part_iter.get_cur_part_id(); + if (cur_part_id == obj_part.part_number) { + continue; + } + + if (max_parts < 1) { + *truncated = true; + break; + } + + /* get_part_obj_state alters the passed manifest** to point to a part + * manifest, which we don't want to leak out here */ + RGWObjManifest* obj_m = manifest; + RGWObjState* astate; + bool part_prefetch = false; + ret = RGWRados::get_part_obj_state(dpp, y, store->getRados(), bucket_info, &obj_ctx, + obj_m, cur_part_id, &parts_count, + part_prefetch, &astate, &obj_m); + + if (ret < 0) { + ldpp_dout_fmt(dpp, 4, + "{} get_part_obj_state() failed ret={}", + __func__, ret); + break; + } + + obj_part.part_number = part_iter.get_cur_part_id(); + obj_part.part_size = astate->accounted_size; + + if (auto iter = astate->attrset.find(RGW_ATTR_CKSUM); + iter != astate->attrset.end()) { + try { + rgw::cksum::Cksum part_cksum; + auto ck_iter = iter->second.cbegin(); + part_cksum.decode(ck_iter); + obj_part.cksum = std::move(part_cksum); + } catch (buffer::error& err) { + ldpp_dout_fmt(dpp, 4, + "WARN: {} could not decode stored cksum, " + "caught buffer::error", + __func__); + } + } + + each_func(obj_part); + *next_marker = ++marker; + --max_parts; + } /* each part */ + + return ret; +} 
/* RadosObject::list_parts */ int RadosObject::load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh) { @@ -4500,11 +4644,6 @@ bool RadosZone::get_redirect_endpoint(std::string* endpoint) return true; } -bool RadosZone::has_zonegroup_api(const std::string& api) const -{ - return store->svc()->zone->has_zonegroup_api(api); -} - const std::string& RadosZone::get_current_period_id() { return store->svc()->zone->get_current_period_id(); diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h index 23d81a934b0..e65c3c0050e 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.h +++ b/src/rgw/driver/rados/rgw_sal_rados.h @@ -107,7 +107,6 @@ class RadosZone : public StoreZone { virtual const std::string& get_name() const override; virtual bool is_writeable() override; virtual bool get_redirect_endpoint(std::string* endpoint) override; - virtual bool has_zonegroup_api(const std::string& api) const override; virtual const std::string& get_current_period_id() override; virtual const RGWAccessKey& get_system_key() override; virtual const std::string& get_realm_name() override; @@ -593,12 +592,18 @@ class RadosObject : public StoreObject { StoreObject::set_compressed(); } - virtual bool is_sync_completed(const DoutPrefixProvider* dpp, const ceph::real_time& obj_mtime) override; /* For rgw_admin.cc */ RGWObjState& get_state() { return state; } virtual int load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh = true) override; + + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) override; + virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y, uint32_t flags) override; virtual int 
get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override; virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override; @@ -775,18 +780,23 @@ class RadosBucket : public StoreBucket { optional_yield y, const DoutPrefixProvider *dpp) override; int remove_topics(RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) override; - int get_logging_object_name(std::string& obj_name, - const std::string& prefix, - optional_yield y, - const DoutPrefixProvider *dpp, + int get_logging_object_name(std::string& obj_name, + const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWObjVersionTracker* objv_tracker) override; + int set_logging_object_name(const std::string& obj_name, + const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, + bool new_obj, RGWObjVersionTracker* objv_tracker) override; - int set_logging_object_name(const std::string& obj_name, - const std::string& prefix, - optional_yield y, - const DoutPrefixProvider *dpp, - bool new_obj, + int remove_logging_object_name(const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, RGWObjVersionTracker* objv_tracker) override; int commit_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp) override; + int remove_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp) override; int write_logging_object(const std::string& obj_name, const std::string& record, optional_yield y, const DoutPrefixProvider *dpp, bool async_completion) override; private: diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc index 79d2be0bcfa..bf7a309e864 100644 --- a/src/rgw/driver/rados/rgw_tools.cc +++ b/src/rgw/driver/rados/rgw_tools.cc @@ -339,21 +339,35 @@ int rgw_list_pool(const DoutPrefixProvider *dpp, ldpp_dout(dpp, 10) 
<< "failed to parse cursor: " << marker << dendl; return -EINVAL; } - - auto iter = ioctx.nobjects_begin(oc); + librados::NObjectIterator iter; + try { + iter = ioctx.nobjects_begin(oc); + } catch (const std::system_error& e) { + ldpp_dout(dpp, 1) << "rgw_list_pool: Failed to begin iteration of pool " + << ioctx.get_pool_name() << " with error " + << e.what() << dendl; + return ceph::from_error_code(e.code()); + } /// Pool_iterate if (iter == ioctx.nobjects_end()) return -ENOENT; - for (; oids->size() < max && iter != ioctx.nobjects_end(); ++iter) { - string oid = iter->get_oid(); - ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl; + try { + for (; oids->size() < max && iter != ioctx.nobjects_end(); ++iter) { + string oid = iter->get_oid(); + ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl; - // fill it in with initial values; we may correct later - if (filter && !filter(oid, oid)) - continue; + // fill it in with initial values; we may correct later + if (filter && !filter(oid, oid)) + continue; - oids->push_back(oid); + oids->push_back(oid); + } + } catch (const std::system_error& e) { + ldpp_dout(dpp, 1) << "rgw_list_pool: Failed iterating pool " + << ioctx.get_pool_name() << " with error " + << e.what() << dendl; + return ceph::from_error_code(e.code()); } marker = iter.get_cursor().to_str(); diff --git a/src/rgw/driver/rados/rgw_user.cc b/src/rgw/driver/rados/rgw_user.cc index 894d8e40950..cce593c6bd5 100644 --- a/src/rgw/driver/rados/rgw_user.cc +++ b/src/rgw/driver/rados/rgw_user.cc @@ -189,6 +189,11 @@ static void dump_user_info(Formatter *f, RGWUserInfo &info, } encode_json("type", user_source_type, f); encode_json("mfa_ids", info.mfa_ids, f); + encode_json("account_id", info.account_id, f); + encode_json("path", info.path, f); + encode_json("create_date", info.create_date, f); + encode_json("tags", info.tags, f); + encode_json("group_ids", info.group_ids, f); if (stats) { encode_json("stats", *stats, f); } diff 
--git a/src/rgw/driver/rados/rgw_user.h b/src/rgw/driver/rados/rgw_user.h index ab157f38e39..4ae7d13eff7 100644 --- a/src/rgw/driver/rados/rgw_user.h +++ b/src/rgw/driver/rados/rgw_user.h @@ -19,11 +19,11 @@ #define RGW_USER_ANON_ID "anonymous" -#define SECRET_KEY_LEN 40 -#define PUBLIC_ID_LEN 20 -#define RAND_SUBUSER_LEN 5 +constexpr auto SECRET_KEY_LEN=40; +constexpr auto PUBLIC_ID_LEN=20; +constexpr auto RAND_SUBUSER_LEN=5; -#define XMLNS_AWS_S3 "http://s3.amazonaws.com/doc/2006-03-01/" +constexpr auto XMLNS_AWS_S3 = "http://s3.amazonaws.com/doc/2006-03-01/"; class RGWUserCtl; class RGWBucketCtl; diff --git a/src/rgw/driver/rados/rgw_zone.h b/src/rgw/driver/rados/rgw_zone.h index c542abc76d6..5fb2b4b8096 100644 --- a/src/rgw/driver/rados/rgw_zone.h +++ b/src/rgw/driver/rados/rgw_zone.h @@ -769,7 +769,6 @@ public: int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true); int delete_obj(const DoutPrefixProvider *dpp, optional_yield y); int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y); - int add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y); void fork(); int update(const DoutPrefixProvider *dpp, optional_yield y); diff --git a/src/rgw/rgw_orphan.cc b/src/rgw/radosgw-admin/orphan.cc index b7dc562c721..9fca3b99a7c 100644 --- a/src/rgw/rgw_orphan.cc +++ b/src/rgw/radosgw-admin/orphan.cc @@ -1,6 +1,12 @@ + +/* + * Copyright (C) 2024 IBM +*/ + // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp +#include "radosgw-admin/orphan.h" #include <string> @@ -10,7 +16,6 @@ #include "rgw_op.h" #include "rgw_multi.h" -#include "rgw_orphan.h" #include "rgw_zone.h" #include "rgw_bucket.h" #include "rgw_sal_rados.h" diff --git a/src/rgw/rgw_orphan.h b/src/rgw/radosgw-admin/orphan.h index db811d31d9a..db811d31d9a 100644 --- a/src/rgw/rgw_orphan.h +++ b/src/rgw/radosgw-admin/orphan.h diff --git a/src/rgw/rgw_admin.cc 
b/src/rgw/radosgw-admin/radosgw-admin.cc index 95a7af6a0fa..13936c87952 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/radosgw-admin/radosgw-admin.cc @@ -1,12 +1,15 @@ +/* + * Copyright (C) 2025 IBM +*/ + // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#include <errno.h> -#include <iostream> -#include <sstream> +#include <cerrno> #include <string> - -#include <boost/optional.hpp> +#include <sstream> +#include <optional> +#include <iostream> extern "C" { #include <liboath/oath.h> @@ -38,6 +41,9 @@ extern "C" { #include "include/utime.h" #include "include/str_list.h" +#include "radosgw-admin/orphan.h" +#include "radosgw-admin/sync_checkpoint.h" + #include "rgw_user.h" #include "rgw_otp.h" #include "rgw_rados.h" @@ -48,7 +54,6 @@ extern "C" { #include "rgw_log.h" #include "rgw_formats.h" #include "rgw_usage.h" -#include "rgw_orphan.h" #include "rgw_sync.h" #include "rgw_trim_bilog.h" #include "rgw_trim_datalog.h" @@ -62,7 +67,6 @@ extern "C" { #include "rgw_zone.h" #include "rgw_pubsub.h" #include "rgw_bucket_sync.h" -#include "rgw_sync_checkpoint.h" #include "rgw_lua.h" #include "rgw_sal.h" #include "rgw_sal_config.h" @@ -82,11 +86,6 @@ extern "C" { #define dout_context g_ceph_context -#define SECRET_KEY_LEN 40 -#define PUBLIC_ID_LEN 20 - -using namespace std; - static rgw::sal::Driver* driver = NULL; static constexpr auto dout_subsys = ceph_subsys_rgw; @@ -117,19 +116,13 @@ static const DoutPrefixProvider* dpp() { } \ } while (0) -static inline int posix_errortrans(int r) +using namespace std; + +inline int posix_errortrans(int r) { - switch(r) { - case ERR_NO_SUCH_BUCKET: - r = ENOENT; - break; - default: - break; - } - return r; + return ERR_NO_SUCH_BUCKET == r ? 
ENOENT : r; } - static const std::string LUA_CONTEXT_LIST("prerequest, postrequest, background, getdata, putdata"); void usage() @@ -178,7 +171,8 @@ void usage() cout << " bucket sync disable disable bucket sync\n"; cout << " bucket sync enable enable bucket sync\n"; cout << " bucket radoslist list rados objects backing bucket's objects\n"; - cout << " bucket logging flush flush pending log records object of source bucket to the log bucket to bucket\n"; + cout << " bucket logging flush flush pending log records object of source bucket to the log bucket\n"; + cout << " bucket logging info get info on bucket logging configuration on source bucket or list of sources in log bucket\n"; cout << " bi get retrieve bucket index object entries\n"; cout << " bi put store bucket index object entries\n"; cout << " bi list list raw bucket index entries\n"; @@ -361,6 +355,7 @@ void usage() cout << " --secret/--secret-key=<key> specify secret key\n"; cout << " --gen-access-key generate random access key (for S3)\n"; cout << " --gen-secret generate random secret key\n"; + cout << " --generate-key create user with or without credentials\n"; cout << " --key-type=<type> key type, options are: swift, s3\n"; cout << " --key-active=<bool> activate or deactivate a key\n"; cout << " --temp-url-key[-2]=<key> temp url key\n"; @@ -707,6 +702,7 @@ enum class OPT { BUCKET_OBJECT_SHARD, BUCKET_RESYNC_ENCRYPTED_MULTIPART, BUCKET_LOGGING_FLUSH, + BUCKET_LOGGING_INFO, POLICY, LOG_LIST, LOG_SHOW, @@ -946,6 +942,7 @@ static SimpleCmd::Commands all_cmds = { { "bucket object shard", OPT::BUCKET_OBJECT_SHARD }, { "bucket resync encrypted multipart", OPT::BUCKET_RESYNC_ENCRYPTED_MULTIPART }, { "bucket logging flush", OPT::BUCKET_LOGGING_FLUSH }, + { "bucket logging info", OPT::BUCKET_LOGGING_INFO }, { "policy", OPT::POLICY }, { "log list", OPT::LOG_LIST }, { "log show", OPT::LOG_SHOW }, @@ -1271,7 +1268,7 @@ static int read_input(const string& infile, bufferlist& bl) } } -#define READ_CHUNK 8196 + 
constexpr auto READ_CHUNK=8196; int r; int err; @@ -2546,8 +2543,8 @@ static void sync_status(Formatter *formatter) struct indented { int w; // indent width - std::string_view header; - indented(int w, std::string_view header = "") : w(w), header(header) {} + std::string header; + indented(int w, std::string header = "") : w(w), header(header) {} }; std::ostream& operator<<(std::ostream& out, const indented& h) { return out << std::setw(h.w) << h.header << std::setw(1) << ' '; @@ -2555,10 +2552,10 @@ std::ostream& operator<<(std::ostream& out, const indented& h) { struct bucket_source_sync_info { const RGWZone& _source; - std::string_view error; + std::string error; std::map<int,std::string> shards_behind; int total_shards; - std::string_view status; + std::string status; rgw_bucket bucket_source; bucket_source_sync_info(const RGWZone& source): _source(source) {} @@ -3078,14 +3075,12 @@ static int bucket_sync_status(rgw::sal::Driver* driver, const RGWBucketInfo& inf } if (pipe.source.zone.value_or(rgw_zone_id()) == z->second.id) { bucket_source_sync_info source_sync_info(z->second); - auto ret = bucket_source_sync_status(dpp(), static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone(), z->second, + bucket_source_sync_status(dpp(), static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone(), z->second, c->second, info, pipe, source_sync_info); - if (ret == 0) { - bucket_sync_info.source_status_info.emplace_back(std::move(source_sync_info)); - } + bucket_sync_info.source_status_info.emplace_back(std::move(source_sync_info)); } } } @@ -3556,6 +3551,13 @@ int main(int argc, const char **argv) OPT opt_cmd = OPT::NO_CMD; int gen_access_key = 0; int gen_secret_key = 0; + enum generate_key_enum { + OPTION_SET_FALSE = 0, + OPTION_SET_TRUE = 1, + OPTION_NOT_SET = 2, + }; + + generate_key_enum generate_key = OPTION_NOT_SET; bool set_perm = false; bool set_temp_url_key = 
false; map<int, string> temp_url_keys; @@ -3837,6 +3839,17 @@ int main(int argc, const char **argv) cerr << "bad key type: " << key_type_str << std::endl; exit(1); } + } else if (ceph_argparse_witharg(args, i, &val, "--generate-key", (char*)NULL)) { + key_type_str = val; + if (key_type_str.compare("true") == 0) { + generate_key = OPTION_SET_TRUE; + } else if(key_type_str.compare("false") == 0) { + generate_key = OPTION_SET_FALSE; + } else { + cerr << "wrong value for --generate-key: " << key_type_str << " please specify either true or false" << std::endl; + exit(1); + } + // do nothing } else if (ceph_argparse_binary_flag(args, i, &key_active, NULL, "--key-active", (char*)NULL)) { key_active_specified = true; } else if (ceph_argparse_witharg(args, i, &val, "--job-id", (char*)NULL)) { @@ -4600,14 +4613,21 @@ int main(int argc, const char **argv) } /* check key parameter conflict */ - if ((!access_key.empty()) && gen_access_key) { - cerr << "ERROR: key parameter conflict, --access-key & --gen-access-key" << std::endl; + if ((!access_key.empty()) && (gen_access_key || generate_key == OPTION_SET_TRUE)) { + cerr << "ERROR: key parameter conflict, --access-key & --gen-access-key/generate-key" << std::endl; return EINVAL; } - if ((!secret_key.empty()) && gen_secret_key) { - cerr << "ERROR: key parameter conflict, --secret & --gen-secret" << std::endl; + if ((!secret_key.empty()) && (gen_secret_key || generate_key == OPTION_SET_TRUE)) { + cerr << "ERROR: key parameter conflict, --secret & --gen-secret/generate-key" << std::endl; return EINVAL; } + if (generate_key == OPTION_SET_FALSE) { + if ((!access_key.empty()) || gen_access_key || (!secret_key.empty()) || gen_secret_key) { + cerr << "ERROR: key parameter conflict, if --generate-key is not set so no other key parameters can be set" << std::endl; + return EINVAL; + } + } + } // default to pretty json @@ -6772,7 +6792,7 @@ int main(int argc, const char **argv) } break; case OPT::USER_CREATE: - if 
(!user_op.has_existing_user()) { + if (!user_op.has_existing_user() && (generate_key != OPTION_SET_FALSE)) { user_op.set_generate_key(); // generate a new key by default } ret = ruser.add(dpp(), user_op, null_yield, &err_msg); @@ -7731,6 +7751,47 @@ int main(int argc, const char **argv) return 0; } + if (opt_cmd == OPT::BUCKET_LOGGING_INFO) { + if (bucket_name.empty()) { + cerr << "ERROR: bucket not specified" << std::endl; + return EINVAL; + } + int ret = init_bucket(tenant, bucket_name, bucket_id, &bucket); + if (ret < 0) { + return -ret; + } + const auto& bucket_attrs = bucket->get_attrs(); + auto iter = bucket_attrs.find(RGW_ATTR_BUCKET_LOGGING); + if (iter != bucket_attrs.end()) { + rgw::bucketlogging::configuration configuration; + try { + configuration.enabled = true; + decode(configuration, iter->second); + } catch (buffer::error& err) { + cerr << "ERROR: failed to decode logging attribute '" << RGW_ATTR_BUCKET_LOGGING + << "'. error: " << err.what() << std::endl; + return EINVAL; + } + encode_json("logging", configuration, formatter.get()); + formatter->flush(cout); + } + iter = bucket_attrs.find(RGW_ATTR_BUCKET_LOGGING_SOURCES); + if (iter != bucket_attrs.end()) { + rgw::bucketlogging::source_buckets sources; + try { + decode(sources, iter->second); + } catch (buffer::error& err) { + cerr << "ERROR: failed to decode logging sources attribute '" << RGW_ATTR_BUCKET_LOGGING_SOURCES + << "'. error: " << err.what() << std::endl; + return EINVAL; + } + encode_json("logging_sources", sources, formatter.get()); + formatter->flush(cout); + } + + return 0; + } + if (opt_cmd == OPT::LOG_LIST) { // filter by date? if (date.size() && date.size() != 10) { diff --git a/src/rgw/rgw_sync_checkpoint.cc b/src/rgw/radosgw-admin/sync_checkpoint.cc index 1172e79a48f..0303ed6c747 100644 --- a/src/rgw/rgw_sync_checkpoint.cc +++ b/src/rgw/radosgw-admin/sync_checkpoint.cc @@ -5,6 +5,7 @@ * Ceph - scalable distributed file system * * Copyright (C) 2020 Red Hat, Inc. 
+ * Copyright (C) 2024 IBM * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -13,9 +14,12 @@ * */ +#include "radosgw-admin/sync_checkpoint.h" + #include <fmt/format.h> + #include "common/errno.h" -#include "rgw_sync_checkpoint.h" + #include "rgw_sal_rados.h" #include "rgw_bucket_sync.h" #include "rgw_data_sync.h" diff --git a/src/rgw/rgw_sync_checkpoint.h b/src/rgw/radosgw-admin/sync_checkpoint.h index 28df68d8860..28df68d8860 100644 --- a/src/rgw/rgw_sync_checkpoint.h +++ b/src/rgw/radosgw-admin/sync_checkpoint.h diff --git a/src/rgw/rgw_auth.cc b/src/rgw/rgw_auth.cc index ec2a2079622..a0b494eb9c5 100644 --- a/src/rgw/rgw_auth.cc +++ b/src/rgw/rgw_auth.cc @@ -188,7 +188,8 @@ int load_account_and_policies(const DoutPrefixProvider* dpp, static auto transform_old_authinfo(const RGWUserInfo& user, std::optional<RGWAccountInfo> account, - std::vector<IAM::Policy> policies) + std::vector<IAM::Policy> policies, + sal::Driver* driver) -> std::unique_ptr<rgw::auth::Identity> { /* This class is not intended for public use. Should be removed altogether @@ -198,6 +199,7 @@ static auto transform_old_authinfo(const RGWUserInfo& user, /* For this particular case it's OK to use rgw_user structure to convey * the identity info as this was the policy for doing that before the * new auth. 
*/ + sal::Driver* driver; const rgw_user id; const std::string display_name; const std::string path; @@ -208,8 +210,10 @@ static auto transform_old_authinfo(const RGWUserInfo& user, public: DummyIdentityApplier(const RGWUserInfo& user, std::optional<RGWAccountInfo> account, - std::vector<IAM::Policy> policies) - : id(user.user_id), + std::vector<IAM::Policy> policies, + sal::Driver* driver) + : driver(driver), + id(user.user_id), display_name(user.display_name), path(user.path), is_admin(user.admin), @@ -294,9 +298,9 @@ static auto transform_old_authinfo(const RGWUserInfo& user, << ", is_admin=" << is_admin << ")"; } - void load_acct_info(const DoutPrefixProvider* dpp, - RGWUserInfo& user_info) const override { + auto load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> override { // noop, this user info was passed in on construction + return driver->get_user(id); } void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const { @@ -307,13 +311,14 @@ static auto transform_old_authinfo(const RGWUserInfo& user, }; return std::make_unique<DummyIdentityApplier>( - user, std::move(account), std::move(policies)); + user, std::move(account), std::move(policies), driver); } auto transform_old_authinfo(const DoutPrefixProvider* dpp, optional_yield y, sal::Driver* driver, - sal::User* user) + sal::User* user, + std::vector<IAM::Policy>* policies_) -> tl::expected<std::unique_ptr<Identity>, int> { const RGWUserInfo& info = user->get_info(); @@ -328,7 +333,10 @@ auto transform_old_authinfo(const DoutPrefixProvider* dpp, return tl::unexpected(r); } - return transform_old_authinfo(info, std::move(account), std::move(policies)); + if (policies_) { // return policies to caller if requested + *policies_ = policies; + } + return transform_old_authinfo(info, std::move(account), std::move(policies), driver); } } /* namespace auth */ @@ -523,7 +531,7 @@ rgw::auth::Strategy::apply(const DoutPrefixProvider *dpp, const rgw::auth::Strat /* 
Account used by a given RGWOp is decoupled from identity employed * in the authorization phase (RGWOp::verify_permissions). */ - applier->load_acct_info(dpp, s->user->get_info()); + s->user = applier->load_acct_info(dpp); s->perm_mask = applier->get_perm_mask(); /* This is the single place where we pass req_state as a pointer @@ -631,36 +639,36 @@ void rgw::auth::WebIdentityApplier::create_account(const DoutPrefixProvider* dpp user_info = user->get_info(); } -void rgw::auth::WebIdentityApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const { +auto rgw::auth::WebIdentityApplier::load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> { rgw_user federated_user; federated_user.id = this->sub; federated_user.tenant = role_tenant; federated_user.ns = "oidc"; + std::unique_ptr<rgw::sal::User> user = driver->get_user(federated_user); if (account) { // we don't need shadow users for account roles because bucket ownership, // quota, and stats are tracked by the account instead of the user - user_info.user_id = std::move(federated_user); + RGWUserInfo& user_info = user->get_info(); user_info.display_name = user_name; user_info.type = TYPE_WEB; - return; + // the user_info.user_id is initialized by driver->get_user(...) + return user; } - std::unique_ptr<rgw::sal::User> user = driver->get_user(federated_user); - //Check in oidc namespace if (user->load_user(dpp, null_yield) >= 0) { /* Succeeded. */ - user_info = user->get_info(); - return; + // the user_info in user is initialized by user->load_user(...) + return user; } user->clear_ns(); //Check for old users which wouldn't have been created in oidc namespace if (user->load_user(dpp, null_yield) >= 0) { /* Succeeded. */ - user_info = user->get_info(); - return; + // the user_info in user is initialized by user->load_user(...) 
+ return user; } //Check if user_id.buckets already exists, may have been from the time, when shadow users didnt exist @@ -671,7 +679,7 @@ void rgw::auth::WebIdentityApplier::load_acct_info(const DoutPrefixProvider* dpp last_synced, last_updated); if (ret < 0 && ret != -ENOENT) { ldpp_dout(dpp, 0) << "ERROR: reading stats for the user returned error " << ret << dendl; - return; + return user; } if (ret == -ENOENT) { /* in case of ENOENT, which means user doesnt have buckets */ //In this case user will be created in oidc namespace @@ -684,7 +692,8 @@ void rgw::auth::WebIdentityApplier::load_acct_info(const DoutPrefixProvider* dpp } ldpp_dout(dpp, 0) << "NOTICE: couldn't map oidc federated user " << federated_user << dendl; - create_account(dpp, federated_user, this->user_name, user_info); + create_account(dpp, federated_user, this->user_name, user->get_info()); + return user; } void rgw::auth::WebIdentityApplier::modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const @@ -936,7 +945,7 @@ void rgw::auth::RemoteApplier::write_ops_log_entry(rgw_log_entry& entry) const } /* TODO(rzarzynski): we need to handle display_name changes. */ -void rgw::auth::RemoteApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */ +auto rgw::auth::RemoteApplier::load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> /* out */ { /* It's supposed that RGWRemoteAuthApplier tries to load account info * that belongs to the authenticated identity. Another policy may be @@ -975,9 +984,9 @@ void rgw::auth::RemoteApplier::load_acct_info(const DoutPrefixProvider* dpp, RGW (void) load_account_and_policies(dpp, null_yield, driver, user->get_info(), user->get_attrs(), account, policies); - user_info = std::move(user->get_info()); owner_acct_user = std::move(tenanted_uid); - return; + // the user_info in user is initialized by user->load_user(...) 
+ return user; } } @@ -990,15 +999,16 @@ void rgw::auth::RemoteApplier::load_acct_info(const DoutPrefixProvider* dpp, RGW (void) load_account_and_policies(dpp, null_yield, driver, user->get_info(), user->get_attrs(), account, policies); - user_info = std::move(user->get_info()); owner_acct_user = acct_user; - return; + // the user_info in user is initialized by user->load_user(...) + return user; } ldpp_dout(dpp, 0) << "NOTICE: couldn't map swift user " << acct_user << dendl; - create_account(dpp, acct_user, implicit_tenant, user_info); + create_account(dpp, acct_user, implicit_tenant, user->get_info()); /* Succeeded if we are here (create_account() hasn't throwed). */ + return user; } void rgw::auth::RemoteApplier::modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const @@ -1098,11 +1108,11 @@ uint32_t rgw::auth::LocalApplier::get_perm_mask(const std::string& subuser_name, } } -void rgw::auth::LocalApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */ +auto rgw::auth::LocalApplier::load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> /* out */ { /* Load the account that belongs to the authenticated identity. An extra call * to RADOS may be safely skipped in this case. 
*/ - user_info = this->user_info; + return std::unique_ptr<rgw::sal::User>(user.release()); } void rgw::auth::LocalApplier::modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const @@ -1121,6 +1131,22 @@ void rgw::auth::LocalApplier::write_ops_log_entry(rgw_log_entry& entry) const } } +rgw::auth::LocalApplier::LocalApplier(CephContext* const cct, + std::unique_ptr<rgw::sal::User> user, + std::optional<RGWAccountInfo> account, + std::vector<IAM::Policy> policies, + std::string subuser, + const std::optional<uint32_t>& perm_mask, + const std::string access_key_id) + : user_info(user->get_info()), + user(std::move(user)), + account(std::move(account)), + policies(std::move(policies)), + subuser(std::move(subuser)), + perm_mask(perm_mask.value_or(RGW_PERM_INVALID)), + access_key_id(access_key_id) { +} + ACLOwner rgw::auth::RoleApplier::get_aclowner() const { ACLOwner owner; @@ -1183,10 +1209,11 @@ bool rgw::auth::RoleApplier::is_identity(const Principal& p) const { return false; } -void rgw::auth::RoleApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */ +auto rgw::auth::RoleApplier::load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> /* out */ { /* Load the user id */ - user_info.user_id = this->token_attrs.user_id; + std::unique_ptr<rgw::sal::User> user = driver->get_user(this->token_attrs.user_id); + return user; } void rgw::auth::RoleApplier::write_ops_log_entry(rgw_log_entry& entry) const @@ -1267,9 +1294,10 @@ rgw::auth::AnonymousEngine::authenticate(const DoutPrefixProvider* dpp, const re } else { RGWUserInfo user_info; rgw_get_anon_user(user_info); - + std::unique_ptr<rgw::sal::User> user = s->user->clone(); + user->get_info() = user_info; auto apl = \ - apl_factory->create_apl_local(cct, s, user_info, std::nullopt, {}, + apl_factory->create_apl_local(cct, s, std::move(user), std::nullopt, {}, rgw::auth::LocalApplier::NO_SUBUSER, std::nullopt, 
rgw::auth::LocalApplier::NO_ACCESS_KEY); return result_t::grant(std::move(apl)); diff --git a/src/rgw/rgw_auth.h b/src/rgw/rgw_auth.h index f3edbbab845..22b0816bac9 100644 --- a/src/rgw/rgw_auth.h +++ b/src/rgw/rgw_auth.h @@ -105,7 +105,8 @@ inline std::ostream& operator<<(std::ostream& out, auto transform_old_authinfo(const DoutPrefixProvider* dpp, optional_yield y, sal::Driver* driver, - sal::User* user) + sal::User* user, + std::vector<IAM::Policy>* policies_ = nullptr) -> tl::expected<std::unique_ptr<Identity>, int>; // Load the user account and all user/group policies. May throw @@ -139,7 +140,7 @@ public: * * XXX: be aware that the "account" term refers to rgw_user. The naming * is legacy. */ - virtual void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const = 0; /* out */ + virtual auto load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> = 0; /* out */ /* Apply any changes to request state. This method will be most useful for * TempURL of Swift API. 
*/ @@ -484,7 +485,7 @@ public: bool is_identity(const Principal& p) const override; - void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; + auto load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> override; uint32_t get_identity_type() const override { return TYPE_WEB; @@ -656,7 +657,7 @@ public: uint32_t get_perm_mask() const override { return info.perm_mask; } void to_str(std::ostream& out) const override; - void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + auto load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> override; /* out */ void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const override; void write_ops_log_entry(rgw_log_entry& entry) const override; uint32_t get_identity_type() const override { return info.acct_type; } @@ -683,7 +684,7 @@ public: /* rgw::auth::LocalApplier targets those auth engines that base on the data - * enclosed in the RGWUserInfo control structure. As a side effect of doing + * enclosed in the rgw::sal::User->RGWUserInfo control structure. As a side effect of doing * the authentication process, they must have it loaded. Leveraging this is * a way to avoid unnecessary calls to underlying RADOS store. 
*/ class LocalApplier : public IdentityApplier { @@ -691,6 +692,7 @@ class LocalApplier : public IdentityApplier { protected: const RGWUserInfo user_info; + mutable std::unique_ptr<rgw::sal::User> user; const std::optional<RGWAccountInfo> account; const std::vector<IAM::Policy> policies; const std::string subuser; @@ -705,19 +707,12 @@ public: static const std::string NO_ACCESS_KEY; LocalApplier(CephContext* const cct, - const RGWUserInfo& user_info, + std::unique_ptr<rgw::sal::User> user, std::optional<RGWAccountInfo> account, std::vector<IAM::Policy> policies, std::string subuser, const std::optional<uint32_t>& perm_mask, - const std::string access_key_id) - : user_info(user_info), - account(std::move(account)), - policies(std::move(policies)), - subuser(std::move(subuser)), - perm_mask(perm_mask.value_or(RGW_PERM_INVALID)), - access_key_id(access_key_id) { - } + const std::string access_key_id); ACLOwner get_aclowner() const override; uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override; @@ -732,7 +727,7 @@ public: } } void to_str(std::ostream& out) const override; - void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + auto load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> override; /* out */ void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const override; uint32_t get_identity_type() const override { return user_info.type; } std::string get_acct_name() const override { return {}; } @@ -750,7 +745,7 @@ public: virtual ~Factory() {} virtual aplptr_t create_apl_local(CephContext* cct, const req_state* s, - const RGWUserInfo& user_info, + std::unique_ptr<rgw::sal::User> user, std::optional<RGWAccountInfo> account, std::vector<IAM::Policy> policies, const std::string& subuser, @@ -779,15 +774,20 @@ public: std::vector<std::pair<std::string, std::string>> principal_tags; }; protected: + CephContext* const cct; 
+ rgw::sal::Driver* driver; Role role; TokenAttrs token_attrs; public: RoleApplier(CephContext* const cct, + rgw::sal::Driver* driver, const Role& role, const TokenAttrs& token_attrs) - : role(role), + : cct(cct), + driver(driver), + role(role), token_attrs(token_attrs) {} ACLOwner get_aclowner() const override; @@ -803,7 +803,7 @@ public: return RGW_PERM_NONE; } void to_str(std::ostream& out) const override; - void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + auto load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> override; /* out */ uint32_t get_identity_type() const override { return TYPE_ROLE; } std::string get_acct_name() const override { return {}; } std::string get_subuser() const override { return {}; } diff --git a/src/rgw/rgw_auth_filters.h b/src/rgw/rgw_auth_filters.h index a93641e8b8e..7d264197c52 100644 --- a/src/rgw/rgw_auth_filters.h +++ b/src/rgw/rgw_auth_filters.h @@ -117,8 +117,8 @@ public: return get_decoratee().get_account(); } - void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override { /* out */ - return get_decoratee().load_acct_info(dpp, user_info); + auto load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> override { /* out */ + return get_decoratee().load_acct_info(dpp); } void modify_request_state(const DoutPrefixProvider* dpp, req_state * s) const override { /* in/out */ @@ -152,7 +152,7 @@ public: } void to_str(std::ostream& out) const override; - void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + auto load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> override; /* out */ }; /* static declaration: UNKNOWN_ACCT will be an empty rgw_user that is a result @@ -169,23 +169,25 @@ void ThirdPartyAccountApplier<T>::to_str(std::ostream& out) const } template <typename T> -void 
ThirdPartyAccountApplier<T>::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const +auto ThirdPartyAccountApplier<T>::load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> { + std::unique_ptr<rgw::sal::User> luser; if (UNKNOWN_ACCT == acct_user_override) { /* There is no override specified by the upper layer. This means that we'll * load the account owned by the authenticated identity (aka auth_user). */ - DecoratedApplier<T>::load_acct_info(dpp, user_info); + luser = DecoratedApplier<T>::load_acct_info(dpp); } else if (DecoratedApplier<T>::is_owner_of(acct_user_override)) { /* The override has been specified but the account belongs to the authenticated * identity. We may safely forward the call to a next stage. */ - DecoratedApplier<T>::load_acct_info(dpp, user_info); + luser = DecoratedApplier<T>::load_acct_info(dpp); } else if (this->is_anonymous()) { /* If the user was authed by the anonymous engine then scope the ANON user * to the correct tenant */ + luser = driver->get_user(rgw_user(RGW_USER_ANON_ID)); if (acct_user_override.tenant.empty()) - user_info.user_id = rgw_user(acct_user_override.id, RGW_USER_ANON_ID); + luser->get_info().user_id = rgw_user(acct_user_override.id, RGW_USER_ANON_ID); else - user_info.user_id = rgw_user(acct_user_override.tenant, RGW_USER_ANON_ID); + luser->get_info().user_id = rgw_user(acct_user_override.tenant, RGW_USER_ANON_ID); } else { /* Compatibility mechanism for multi-tenancy. For more details refer to * load_acct_info method of rgw::auth::RemoteApplier. */ @@ -196,9 +198,10 @@ void ThirdPartyAccountApplier<T>::load_acct_info(const DoutPrefixProvider* dpp, user = driver->get_user(tenanted_uid); if (user->load_user(dpp, null_yield) >= 0) { - user_info = user->get_info(); + // the user_info in luser is initialized by user->load_user(...) + luser = user->clone(); /* Succeeded. 
*/ - return; + return luser; } } @@ -213,8 +216,10 @@ void ThirdPartyAccountApplier<T>::load_acct_info(const DoutPrefixProvider* dpp, throw ret; } } - user_info = user->get_info(); + // the user_info in luser is initialized by user->load_user(...) + luser = user->clone(); } + return luser; } template <typename T> static inline @@ -248,7 +253,7 @@ public: } void to_str(std::ostream& out) const override; - void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */ + auto load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> override; /* out */ void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const override; /* in/out */ ACLOwner get_aclowner() const override { @@ -271,10 +276,10 @@ void SysReqApplier<T>::to_str(std::ostream& out) const } template <typename T> -void SysReqApplier<T>::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const +auto SysReqApplier<T>::load_acct_info(const DoutPrefixProvider* dpp) const -> std::unique_ptr<rgw::sal::User> { - DecoratedApplier<T>::load_acct_info(dpp, user_info); - is_system = user_info.system; + std::unique_ptr<rgw::sal::User> user = DecoratedApplier<T>::load_acct_info(dpp); + is_system = user->get_info().system; if (is_system) { //ldpp_dout(dpp, 20) << "system request" << dendl; @@ -285,7 +290,7 @@ void SysReqApplier<T>::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo effective_owner->id = parse_owner(str); if (const auto* uid = std::get_if<rgw_user>(&effective_owner->id); uid) { - std::unique_ptr<rgw::sal::User> user = driver->get_user(*uid); + user = driver->get_user(*uid); if (user->load_user(dpp, null_yield) < 0) { //ldpp_dout(dpp, 0) << "User lookup failed!" 
<< dendl; throw -EACCES; @@ -294,14 +299,14 @@ void SysReqApplier<T>::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo } } } + return user; } template <typename T> void SysReqApplier<T>::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s) const { if (boost::logic::indeterminate(is_system)) { - RGWUserInfo unused_info; - load_acct_info(dpp, unused_info); + std::unique_ptr<rgw::sal::User> unused_user{ load_acct_info(dpp) }; } if (is_system) { diff --git a/src/rgw/rgw_auth_s3.h b/src/rgw/rgw_auth_s3.h index 2f7fd2d7598..5815a520e02 100644 --- a/src/rgw/rgw_auth_s3.h +++ b/src/rgw/rgw_auth_s3.h @@ -55,14 +55,14 @@ class STSAuthStrategy : public rgw::auth::Strategy, aplptr_t create_apl_local(CephContext* const cct, const req_state* const s, - const RGWUserInfo& user_info, + std::unique_ptr<rgw::sal::User> user, std::optional<RGWAccountInfo> account, std::vector<IAM::Policy> policies, const std::string& subuser, const std::optional<uint32_t>& perm_mask, const std::string& access_key_id) const override { auto apl = rgw::auth::add_sysreq(cct, driver, s, - LocalApplier(cct, user_info, std::move(account), std::move(policies), + LocalApplier(cct, std::move(user), std::move(account), std::move(policies), subuser, perm_mask, access_key_id)); return aplptr_t(new decltype(apl)(std::move(apl))); } @@ -72,7 +72,7 @@ class STSAuthStrategy : public rgw::auth::Strategy, RoleApplier::Role role, RoleApplier::TokenAttrs token_attrs) const override { auto apl = rgw::auth::add_sysreq(cct, driver, s, - rgw::auth::RoleApplier(cct, std::move(role), std::move(token_attrs))); + rgw::auth::RoleApplier(cct, driver, std::move(role), std::move(token_attrs))); return aplptr_t(new decltype(apl)(std::move(apl))); } @@ -176,14 +176,14 @@ class AWSAuthStrategy : public rgw::auth::Strategy, aplptr_t create_apl_local(CephContext* const cct, const req_state* const s, - const RGWUserInfo& user_info, + std::unique_ptr<rgw::sal::User> user, std::optional<RGWAccountInfo> account, 
std::vector<IAM::Policy> policies, const std::string& subuser, const std::optional<uint32_t>& perm_mask, const std::string& access_key_id) const override { auto apl = rgw::auth::add_sysreq(cct, driver, s, - LocalApplier(cct, user_info, std::move(account), std::move(policies), + LocalApplier(cct, std::move(user), std::move(account), std::move(policies), subuser, perm_mask, access_key_id)); /* TODO(rzarzynski): replace with static_ptr. */ return aplptr_t(new decltype(apl)(std::move(apl))); diff --git a/src/rgw/rgw_bucket_logging.cc b/src/rgw/rgw_bucket_logging.cc index d24a53024f1..dd407f26e8c 100644 --- a/src/rgw/rgw_bucket_logging.cc +++ b/src/rgw/rgw_bucket_logging.cc @@ -192,7 +192,7 @@ ceph::coarse_real_time time_from_name(const std::string& obj_name, const DoutPre ldpp_dout(dpp, 1) << "ERROR: logging object name too short: " << obj_name << dendl; return extracted_time; } - const auto time_start_pos = obj_name_length - (time_format_length + UniqueStringLength + 1); + const auto time_start_pos = obj_name_length - (time_format_length + UniqueStringLength + 1); // note: +1 is for the dash between the timestamp and the unique string std::string time_str = obj_name.substr(time_start_pos, time_format_length); @@ -206,6 +206,13 @@ ceph::coarse_real_time time_from_name(const std::string& obj_name, const DoutPre return extracted_time; } +std::string full_bucket_name(const std::unique_ptr<rgw::sal::Bucket>& bucket) { + if (bucket->get_tenant().empty()) { + return bucket->get_name(); + } + return fmt::format("{}:{}", bucket->get_tenant(), bucket->get_name()); +} + int new_logging_object(const configuration& conf, const std::unique_ptr<rgw::sal::Bucket>& bucket, std::string& obj_name, @@ -235,23 +242,22 @@ int new_logging_object(const configuration& conf, conf.target_prefix, to_string(bucket->get_owner()), source_region, - bucket->get_name(), + full_bucket_name(bucket), t, t, unique); } break; } - int ret = bucket->set_logging_object_name(obj_name, conf.target_prefix, y, 
dpp, init_obj, objv_tracker); if (ret == -EEXIST || ret == -ECANCELED) { if (ret = bucket->get_logging_object_name(obj_name, conf.target_prefix, y, dpp, nullptr); ret < 0) { ldpp_dout(dpp, 1) << "ERROR: failed to get name of logging object of bucket '" << - conf.target_bucket << "'. ret = " << ret << dendl; + conf.target_bucket << "' and prefix '" << conf.target_prefix << "', ret = " << ret << dendl; return ret; } ldpp_dout(dpp, 20) << "INFO: name already set. got name of logging object '" << obj_name << "' of bucket '" << - conf.target_bucket << "'" << dendl; + conf.target_bucket << "' and prefix '" << conf.target_prefix << "'" << dendl; return -ECANCELED; } else if (ret < 0) { ldpp_dout(dpp, 1) << "ERROR: failed to write name of logging object '" << obj_name << "' of bucket '" << @@ -263,6 +269,44 @@ int new_logging_object(const configuration& conf, return 0; } +int commit_logging_object(const configuration& conf, + const DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const std::string& tenant_name, + optional_yield y) { + std::string target_bucket_name; + std::string target_tenant_name; + auto ret = rgw_parse_url_bucket(conf.target_bucket, tenant_name, target_tenant_name, target_bucket_name); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to parse target bucket '" << conf.target_bucket << "' when commiting logging object, ret = " + << ret << dendl; + return ret; + } + const rgw_bucket target_bucket_id(target_tenant_name, target_bucket_name); + std::unique_ptr<rgw::sal::Bucket> target_bucket; + ret = driver->load_bucket(dpp, target_bucket_id, + &target_bucket, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to get target logging bucket '" << target_bucket_id << "' when commiting logging object, ret = " + << ret << dendl; + return ret; + } + return commit_logging_object(conf, target_bucket, dpp, y); +} + +int commit_logging_object(const configuration& conf, + const std::unique_ptr<rgw::sal::Bucket>& target_bucket, + const 
DoutPrefixProvider *dpp, + optional_yield y) { + std::string obj_name; + if (const auto ret = target_bucket->get_logging_object_name(obj_name, conf.target_prefix, y, dpp, nullptr); ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to get name of logging object of bucket '" << + target_bucket->get_info().bucket << "'. ret = " << ret << dendl; + return ret; + } + return target_bucket->commit_logging_object(obj_name, y, dpp); +} + int rollover_logging_object(const configuration& conf, const std::unique_ptr<rgw::sal::Bucket>& bucket, std::string& obj_name, @@ -270,12 +314,16 @@ int rollover_logging_object(const configuration& conf, optional_yield y, bool must_commit, RGWObjVersionTracker* objv_tracker) { - if (conf.target_bucket != bucket->get_name()) { - ldpp_dout(dpp, 1) << "ERROR: bucket name mismatch: '" << conf.target_bucket << "' != '" << bucket->get_name() << "'" << dendl; + std::string target_bucket_name; + std::string target_tenant_name; + std::ignore = rgw_parse_url_bucket(conf.target_bucket, bucket->get_tenant(), target_tenant_name, target_bucket_name); + if (target_bucket_name != bucket->get_name() || target_tenant_name != bucket->get_tenant()) { + ldpp_dout(dpp, 1) << "ERROR: bucket name mismatch. conf= '" << conf.target_bucket << + "', bucket= '" << bucket->get_info().bucket << "'" << dendl; return -EINVAL; } const auto old_obj = obj_name; - const auto ret = new_logging_object(conf, bucket, obj_name, dpp, y, false, objv_tracker); + const auto ret = new_logging_object(conf, bucket, obj_name, dpp, y, false, objv_tracker); if (ret == -ECANCELED) { ldpp_dout(dpp, 20) << "INFO: rollover already performed for '" << old_obj << "' to bucket '" << conf.target_bucket << "'. 
ret = " << ret << dendl; @@ -342,14 +390,14 @@ S3 bucket short (ceph) log record - eTag };*/ -int log_record(rgw::sal::Driver* driver, +int log_record(rgw::sal::Driver* driver, const sal::Object* obj, - const req_state* s, - const std::string& op_name, - const std::string& etag, + const req_state* s, + const std::string& op_name, + const std::string& etag, size_t size, const configuration& conf, - const DoutPrefixProvider *dpp, + const DoutPrefixProvider *dpp, optional_yield y, bool async_completion, bool log_source_bucket) { @@ -357,11 +405,19 @@ int log_record(rgw::sal::Driver* driver, ldpp_dout(dpp, 1) << "ERROR: only bucket operations are logged" << dendl; return -EINVAL; } + std::string target_bucket_name; + std::string target_tenant_name; + auto ret = rgw_parse_url_bucket(conf.target_bucket, s->bucket_tenant, target_tenant_name, target_bucket_name); + if (ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to parse target bucket '" << conf.target_bucket << "', ret = " << ret << dendl; + return ret; + } + const rgw_bucket target_bucket_id(target_tenant_name, target_bucket_name); std::unique_ptr<rgw::sal::Bucket> target_bucket; - auto ret = driver->load_bucket(dpp, rgw_bucket(s->bucket_tenant, conf.target_bucket), + ret = driver->load_bucket(dpp, target_bucket_id, &target_bucket, y); if (ret < 0) { - ldpp_dout(dpp, 1) << "ERROR: failed to get target logging bucket '" << conf.target_bucket << "'. ret = " << ret << dendl; + ldpp_dout(dpp, 1) << "ERROR: failed to get target logging bucket '" << target_bucket_id << "'. 
ret = " << ret << dendl; return ret; } std::string obj_name; @@ -382,12 +438,14 @@ int log_record(rgw::sal::Driver* driver, // try to create the temporary log object for the first time ret = new_logging_object(conf, target_bucket, obj_name, dpp, y, true, nullptr); if (ret == 0) { - ldpp_dout(dpp, 20) << "INFO: first time logging for bucket '" << conf.target_bucket << "'" << dendl; + ldpp_dout(dpp, 20) << "INFO: first time logging for bucket '" << conf.target_bucket << "' and prefix '" << + conf.target_prefix << "'" << dendl; } else if (ret == -ECANCELED) { - ldpp_dout(dpp, 20) << "INFO: logging object '" << obj_name << "' already exists for bucket '" << conf.target_bucket << "', will be used" << dendl; + ldpp_dout(dpp, 20) << "INFO: logging object '" << obj_name << "' already exists for bucket '" << conf.target_bucket << "' and prefix" << + conf.target_prefix << "'" << dendl; } else { ldpp_dout(dpp, 1) << "ERROR: failed to create logging object of bucket '" << - conf.target_bucket << "' for the first time. ret = " << ret << dendl; + conf.target_bucket << "' and prefix '" << conf.target_prefix << "' for the first time. 
ret = " << ret << dendl; return ret; } } else { @@ -420,7 +478,7 @@ int log_record(rgw::sal::Driver* driver, bucket_name = s->src_bucket_name; } else { bucket_owner = to_string( s->bucket->get_owner()); - bucket_name = s->bucket->get_name(); + bucket_name = full_bucket_name(s->bucket); } switch (conf.logging_type) { @@ -459,7 +517,7 @@ int log_record(rgw::sal::Driver* driver, case LoggingType::Journal: record = fmt::format("{} {} [{:%d/%b/%Y:%H:%M:%S %z}] {} {} {} {} {}", dash_if_empty(to_string(s->bucket->get_owner())), - dash_if_empty(s->bucket->get_name()), + dash_if_empty(full_bucket_name(s->bucket)), t, op_name, dash_if_empty_or_null(obj, obj->get_name()), @@ -512,12 +570,12 @@ std::string object_name_oid(const rgw::sal::Bucket* bucket, const std::string& p int log_record(rgw::sal::Driver* driver, LoggingType type, const sal::Object* obj, - const req_state* s, - const std::string& op_name, - const std::string& etag, - size_t size, - const DoutPrefixProvider *dpp, - optional_yield y, + const req_state* s, + const std::string& op_name, + const std::string& etag, + size_t size, + const DoutPrefixProvider *dpp, + optional_yield y, bool async_completion, bool log_source_bucket) { if (!s->bucket) { @@ -534,7 +592,7 @@ int log_record(rgw::sal::Driver* driver, try { configuration.enabled = true; auto bl_iter = iter->second.cbegin(); - decode(configuration, bl_iter); + decode(configuration, bl_iter); if (type != LoggingType::Any && configuration.logging_type != type) { return 0; } @@ -543,20 +601,199 @@ int log_record(rgw::sal::Driver* driver, return 0; } } - ldpp_dout(dpp, 20) << "INFO: found matching logging configuration of bucket '" << s->bucket->get_name() << + ldpp_dout(dpp, 20) << "INFO: found matching logging configuration of bucket '" << s->bucket->get_info().bucket << "' configuration: " << configuration.to_json_str() << dendl; - if (auto ret = log_record(driver, obj, s, op_name, etag, size, configuration, dpp, y, async_completion, log_source_bucket); ret < 
0) { - ldpp_dout(dpp, 1) << "ERROR: failed to perform logging for bucket '" << s->bucket->get_name() << + if (auto ret = log_record(driver, obj, s, op_name, etag, size, configuration, dpp, y, async_completion, log_source_bucket); ret < 0) { + ldpp_dout(dpp, 1) << "ERROR: failed to perform logging for bucket '" << s->bucket->get_info().bucket << "'. ret=" << ret << dendl; return ret; } } catch (buffer::error& err) { - ldpp_dout(dpp, 1) << "ERROR: failed to decode logging attribute '" << RGW_ATTR_BUCKET_LOGGING + ldpp_dout(dpp, 1) << "ERROR: failed to decode logging attribute '" << RGW_ATTR_BUCKET_LOGGING << "'. error: " << err.what() << dendl; return -EINVAL; } return 0; } +int get_bucket_id(const std::string& bucket_name, const std::string& tenant_name, rgw_bucket& bucket_id) { + std::string parsed_bucket_name; + std::string parsed_tenant_name; + if (const auto ret = rgw_parse_url_bucket(bucket_name, tenant_name, parsed_tenant_name, parsed_bucket_name); ret < 0) { + return ret; + } + bucket_id = rgw_bucket{parsed_tenant_name, parsed_bucket_name}; + return 0; +} + +int update_bucket_logging_sources(const DoutPrefixProvider* dpp, rgw::sal::Driver* driver, const rgw_bucket& target_bucket_id, const rgw_bucket& src_bucket_id, bool add, optional_yield y) { + std::unique_ptr<rgw::sal::Bucket> target_bucket; + const auto ret = driver->load_bucket(dpp, target_bucket_id, &target_bucket, y); + if (ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed to get target bucket '" << target_bucket_id << "', ret = " << ret << dendl; + return ret; + } + return update_bucket_logging_sources(dpp, target_bucket, src_bucket_id, add, y); +} + +int update_bucket_logging_sources(const DoutPrefixProvider* dpp, std::unique_ptr<rgw::sal::Bucket>& bucket, const rgw_bucket& src_bucket_id, bool add, optional_yield y) { + return retry_raced_bucket_write(dpp, bucket.get(), [dpp, &bucket, &src_bucket_id, add, y] { + auto& attrs = bucket->get_attrs(); + auto iter = 
attrs.find(RGW_ATTR_BUCKET_LOGGING_SOURCES); + if (iter == attrs.end()) { + if (!add) { + ldpp_dout(dpp, 20) << "INFO: no logging sources attribute '" << RGW_ATTR_BUCKET_LOGGING_SOURCES + << "' for bucket '" << bucket->get_info().bucket << "', nothing to remove" << dendl; + return 0; + } + source_buckets sources{src_bucket_id}; + bufferlist bl; + ceph::encode(sources, bl); + attrs.insert(std::make_pair(RGW_ATTR_BUCKET_LOGGING_SOURCES, std::move(bl))); + return bucket->merge_and_store_attrs(dpp, attrs, y); + } + try { + source_buckets sources; + ceph::decode(sources, iter->second); + if ((add && sources.insert(src_bucket_id).second) || + (!add && sources.erase(src_bucket_id) > 0)) { + bufferlist bl; + ceph::encode(sources, bl); + iter->second = std::move(bl); + return bucket->merge_and_store_attrs(dpp, attrs, y); + } + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << "WARNING: failed to decode logging sources attribute '" << RGW_ATTR_BUCKET_LOGGING_SOURCES + << "' for bucket '" << bucket->get_info().bucket << "', error: " << err.what() << dendl; + } + ldpp_dout(dpp, 20) << "INFO: logging source '" << src_bucket_id << "' already " << + (add ? 
"added to" : "removed from") << " bucket '" << bucket->get_info().bucket << "'" << dendl; + return 0; + }, y); +} + + +int bucket_deletion_cleanup(const DoutPrefixProvider* dpp, + sal::Driver* driver, + sal::Bucket* bucket, + optional_yield y) { + // if the bucket is used a log bucket, we should delete all pending log objects + // and also delete the object holding the pending object name + auto& attrs = bucket->get_attrs(); + if (const auto iter = attrs.find(RGW_ATTR_BUCKET_LOGGING_SOURCES); iter != attrs.end()) { + try { + source_buckets sources; + ceph::decode(sources, iter->second); + for (const auto& source : sources) { + std::unique_ptr<rgw::sal::Bucket> src_bucket; + if (const auto ret = driver->load_bucket(dpp, source, &src_bucket, y); ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed to get logging source bucket '" << source << "' for log bucket '" << + bucket->get_info().bucket << "', ret = " << ret << dendl; + continue; + } + auto& src_attrs = src_bucket->get_attrs(); + if (const auto iter = src_attrs.find(RGW_ATTR_BUCKET_LOGGING); iter != src_attrs.end()) { + configuration conf; + try { + auto bl_iter = iter->second.cbegin(); + decode(conf, bl_iter); + std::string obj_name; + RGWObjVersionTracker objv; + if (const auto ret = bucket->get_logging_object_name(obj_name, conf.target_prefix, y, dpp, &objv); ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed to get logging object name for log bucket '" << bucket->get_info().bucket << + "', ret = " << ret << dendl; + continue; + } + if (const auto ret = bucket->remove_logging_object(obj_name, y, dpp); ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed to delete pending logging object '" << obj_name << "' for log bucket '" << + bucket->get_info().bucket << "', ret = " << ret << dendl; + continue; + } + ldpp_dout(dpp, 20) << "INFO: successfully deleted pending logging object '" << obj_name << "' from deleted log bucket '" << + bucket->get_info().bucket << "'" << dendl; + if (const auto ret = 
bucket->remove_logging_object_name(conf.target_prefix, y, dpp, &objv); ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed to delete object holding bucket logging object name for log bucket '" << + bucket->get_info().bucket << "', ret = " << ret << dendl; + continue; + } + ldpp_dout(dpp, 20) << "INFO: successfully deleted object holding bucket logging object name from deleted log bucket '" << + bucket->get_info().bucket << "'" << dendl; + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << "WARNING: failed to decode logging attribute '" << RGW_ATTR_BUCKET_LOGGING + << "' of bucket '" << src_bucket->get_info().bucket << "', error: " << err.what() << dendl; + } + } + } + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << "WARNING: failed to decode logging sources attribute '" << RGW_ATTR_BUCKET_LOGGING_SOURCES + << "' for bucket '" << bucket->get_info().bucket << "', error: " << err.what() << dendl; + return -EIO; + } + } + + return source_bucket_cleanup(dpp, driver, bucket, false, y); +} + +int source_bucket_cleanup(const DoutPrefixProvider* dpp, + sal::Driver* driver, + sal::Bucket* bucket, + bool remove_attr, + optional_yield y) { + std::optional<configuration> conf; + const auto& info = bucket->get_info(); + if (const auto ret = retry_raced_bucket_write(dpp, bucket, [dpp, bucket, &conf, &info, remove_attr, y] { + auto& attrs = bucket->get_attrs(); + if (auto iter = attrs.find(RGW_ATTR_BUCKET_LOGGING); iter != attrs.end()) { + try { + auto bl_iter = iter->second.cbegin(); + configuration tmp_conf; + tmp_conf.enabled = true; + decode(tmp_conf, bl_iter); + conf = std::move(tmp_conf); + } catch (buffer::error& err) { + ldpp_dout(dpp, 1) << "WARNING: failed to decode existing logging attribute '" << RGW_ATTR_BUCKET_LOGGING + << "' of bucket '" << info.bucket << "', error: " << err.what() << dendl; + return -EIO; + } + if (remove_attr) { + attrs.erase(iter); + return bucket->merge_and_store_attrs(dpp, attrs, y); + } + } + // nothing to remove or no need to remove + 
return 0; + }, y); ret < 0) { + if (remove_attr) { + ldpp_dout(dpp, 1) << "ERROR: failed to remove logging attribute '" << RGW_ATTR_BUCKET_LOGGING << "' from bucket '" << + info.bucket << "', ret = " << ret << dendl; + } + return ret; + } + if (!conf) { + // no logging attribute found + return 0; + } + if (const auto ret = commit_logging_object(*conf, dpp, driver, info.bucket.tenant, y); ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: could not commit pending logging object of bucket '" << + info.bucket << "', ret = " << ret << dendl; + } else { + ldpp_dout(dpp, 20) << "INFO: successfully committed pending logging object of bucket '" << info.bucket << "'" << dendl; + } + rgw_bucket target_bucket_id; + rgw_bucket src_bucket_id{info.bucket.tenant, info.bucket.name}; + if (const auto ret = get_bucket_id(conf->target_bucket, info.bucket.tenant, target_bucket_id); ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: failed to parse target bucket '" << conf->target_bucket << "', ret = " << ret << dendl; + return 0; + } + if (const auto ret = update_bucket_logging_sources(dpp, driver, target_bucket_id, src_bucket_id, false, y); ret < 0) { + ldpp_dout(dpp, 1) << "WARNING: could not update bucket logging source '" << + info.bucket << "', ret = " << ret << dendl; + return 0; + } + ldpp_dout(dpp, 20) << "INFO: successfully updated bucket logging source '" << + info.bucket << "'"<< dendl; + return 0; +} + } // namespace rgw::bucketlogging diff --git a/src/rgw/rgw_bucket_logging.h b/src/rgw/rgw_bucket_logging.h index d4877bafb0f..cbdb8b55f88 100644 --- a/src/rgw/rgw_bucket_logging.h +++ b/src/rgw/rgw_bucket_logging.h @@ -4,7 +4,6 @@ #pragma once #include <string> -#include <optional> #include <cstdint> #include "rgw_sal_fwd.h" #include "include/buffer.h" @@ -16,7 +15,7 @@ class XMLObj; namespace ceph { class Formatter; } class DoutPrefixProvider; struct req_state; -class RGWObjVersionTracker; +struct RGWObjVersionTracker; class RGWOp; namespace rgw::bucketlogging { @@ -66,6 +65,17 @@ enum 
class LoggingType {Standard, Journal, Any}; enum class PartitionDateSource {DeliveryTime, EventTime}; struct configuration { + bool operator==(const configuration& rhs) const { + return enabled == rhs.enabled && + target_bucket == rhs.target_bucket && + obj_key_format == rhs.obj_key_format && + target_prefix == rhs.target_prefix && + obj_roll_time == rhs.obj_roll_time && + logging_type == rhs.logging_type && + records_batch_size == rhs.records_batch_size && + date_source == rhs.date_source && + key_filter == rhs.key_filter; + } uint32_t default_obj_roll_time = 300; bool enabled = false; std::string target_bucket; @@ -129,6 +139,8 @@ struct configuration { }; WRITE_CLASS_ENCODER(configuration) +using source_buckets = std::set<rgw_bucket>; + constexpr unsigned MAX_BUCKET_LOGGING_BUFFER = 1000; using bucket_logging_records = std::array<std::string, MAX_BUCKET_LOGGING_BUFFER>; @@ -155,7 +167,7 @@ int log_record(rgw::sal::Driver* driver, bool async_completion, bool log_source_bucket); -// commit the pending log objec tto the log bucket +// commit the pending log objec to the log bucket // and create a new pending log object // if "must_commit" is "false" the function will return success even if the pending log object was not committed int rollover_logging_object(const configuration& conf, @@ -166,6 +178,23 @@ int rollover_logging_object(const configuration& conf, bool must_commit, RGWObjVersionTracker* objv_tracker); +// commit the pending log object to the log bucket +// use this for cleanup, when new pending object is not needed +// and target bucket is known +int commit_logging_object(const configuration& conf, + const std::unique_ptr<rgw::sal::Bucket>& target_bucket, + const DoutPrefixProvider *dpp, + optional_yield y); + +// commit the pending log object to the log bucket +// use this for cleanup, when new pending object is not needed +// and target bucket shoud be loaded based on the configuration +int commit_logging_object(const configuration& conf, + const 
DoutPrefixProvider *dpp, + rgw::sal::Driver* driver, + const std::string& tenant_name, + optional_yield y); + // return the oid of the object holding the name of the temporary logging object // bucket - log bucket // prefix - logging prefix from configuration. should be used when multiple buckets log into the same log bucket @@ -185,5 +214,37 @@ int log_record(rgw::sal::Driver* driver, optional_yield y, bool async_completion, bool log_source_bucket); + +// return (by ref) an rgw_bucket object with the bucket name and tenant name +// fails if the bucket name is not in the format: [tenant name:]<bucket name> +int get_bucket_id(const std::string& bucket_name, const std::string& tenant_name, rgw_bucket& bucket_id); + +// update (add or remove) a source bucket from the list of source buckets in the target bucket +// use this function when the target bucket is already loaded +int update_bucket_logging_sources(const DoutPrefixProvider* dpp, std::unique_ptr<rgw::sal::Bucket>& bucket, + const rgw_bucket& src_bucket, bool add, optional_yield y); + +// update (add or remove) a source bucket from the list of source buckets in the target bucket +// use this function when the target bucket is not known and needs to be loaded +int update_bucket_logging_sources(const DoutPrefixProvider* dpp, rgw::sal::Driver* driver, const rgw_bucket& target_bucket_id, + const rgw_bucket& src_bucket_id, bool add, optional_yield y); + +// when source bucket is deleted, all pending log objects should be comitted to the log bucket +// when the target bucket is deleted, all pending log objects should be deleted, as well as the object holding the pending log object name +int bucket_deletion_cleanup(const DoutPrefixProvider* dpp, + sal::Driver* driver, + sal::Bucket* bucket, + optional_yield y); + +// if bucket has bucket logging configuration associated with it then: +// if "remove_attr" is true, the bucket logging configuration should be removed from the bucket +// in addition: +// any pending log 
objects should be comitted to the log bucket +// and the log bucket should be updated to remove the bucket as a source +int source_bucket_cleanup(const DoutPrefixProvider* dpp, + sal::Driver* driver, + sal::Bucket* bucket, + bool remove_attr, + optional_yield y); } // namespace rgw::bucketlogging diff --git a/src/rgw/rgw_cksum_pipe.cc b/src/rgw/rgw_cksum_pipe.cc index e06957e2715..0bec8d341af 100644 --- a/src/rgw/rgw_cksum_pipe.cc +++ b/src/rgw/rgw_cksum_pipe.cc @@ -18,6 +18,7 @@ #include <string> #include <fmt/format.h> #include <boost/algorithm/string.hpp> +#include "rgw_cksum.h" #include "rgw_common.h" #include "common/dout.h" #include "rgw_client_io.h" @@ -34,7 +35,8 @@ namespace rgw::putobj { {} std::unique_ptr<RGWPutObj_Cksum> RGWPutObj_Cksum::Factory( - rgw::sal::DataProcessor* next, const RGWEnv& env) + rgw::sal::DataProcessor* next, const RGWEnv& env, + rgw::cksum::Type override_type) { /* look for matching headers */ auto algo_header = cksum_algorithm_hdr(env); @@ -49,6 +51,13 @@ namespace rgw::putobj { throw rgw::io::Exception(EINVAL, std::system_category()); } /* no checksum header */ + if (override_type != rgw::cksum::Type::none) { + /* XXXX safe? do we need to fixup env as well? 
*/ + auto algo_header = cksum_algorithm_hdr(override_type); + return + std::make_unique<RGWPutObj_Cksum>( + next, override_type, std::move(algo_header)); + } return std::unique_ptr<RGWPutObj_Cksum>(); } diff --git a/src/rgw/rgw_cksum_pipe.h b/src/rgw/rgw_cksum_pipe.h index fddcd283c84..c459d156335 100644 --- a/src/rgw/rgw_cksum_pipe.h +++ b/src/rgw/rgw_cksum_pipe.h @@ -20,6 +20,7 @@ #include <tuple> #include <cstring> #include <boost/algorithm/string/case_conv.hpp> +#include "rgw_cksum.h" #include "rgw_cksum_digest.h" #include "rgw_common.h" #include "rgw_putobj.h" @@ -29,6 +30,38 @@ namespace rgw::putobj { namespace cksum = rgw::cksum; using cksum_hdr_t = std::pair<const char*, const char*>; + static inline const cksum_hdr_t cksum_algorithm_hdr(rgw::cksum::Type t) { + static constexpr std::string_view hdr = + "HTTP_X_AMZ_SDK_CHECKSUM_ALGORITHM"; + using rgw::cksum::Type; + switch (t) { + case Type::sha256: + return cksum_hdr_t(hdr.data(), "SHA256"); + break; + case Type::crc32: + return cksum_hdr_t(hdr.data(), "CRC32"); + break; + case Type::crc32c: + return cksum_hdr_t(hdr.data(), "CRC32C"); + break; + case Type::xxh3: + return cksum_hdr_t(hdr.data(), "XX3"); + break; + case Type::sha1: + return cksum_hdr_t(hdr.data(), "SHA1"); + break; + case Type::sha512: + return cksum_hdr_t(hdr.data(), "SHA512"); + break; + case Type::blake3: + return cksum_hdr_t(hdr.data(), "BLAKE3"); + break; + default: + break; + }; + return cksum_hdr_t(nullptr, nullptr);; + } + static inline const cksum_hdr_t cksum_algorithm_hdr(const RGWEnv& env) { /* If the individual checksum value you provide through x-amz-checksum-algorithm doesn't match the checksum algorithm @@ -102,7 +135,8 @@ namespace rgw::putobj { using VerifyResult = std::tuple<bool, const cksum::Cksum&>; static std::unique_ptr<RGWPutObj_Cksum> Factory( - rgw::sal::DataProcessor* next, const RGWEnv&); + rgw::sal::DataProcessor* next, const RGWEnv&, + rgw::cksum::Type override_type); RGWPutObj_Cksum(rgw::sal::DataProcessor* 
next, rgw::cksum::Type _type, cksum_hdr_t&& _hdr); diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc index 1a59ba02999..6610538542c 100644 --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@ -63,6 +63,7 @@ rgw_http_errors rgw_http_s3_errors({ { ERR_INVALID_DIGEST, {400, "InvalidDigest" }}, { ERR_BAD_DIGEST, {400, "BadDigest" }}, { ERR_INVALID_LOCATION_CONSTRAINT, {400, "InvalidLocationConstraint" }}, + { ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION, {400, "IllegalLocationConstraintException" }}, { ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION, {400, "ZonegroupDefaultPlacementMisconfiguration" }}, { ERR_INVALID_BUCKET_NAME, {400, "InvalidBucketName" }}, { ERR_INVALID_OBJECT_NAME, {400, "InvalidObjectName" }}, @@ -3206,3 +3207,14 @@ void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct) append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN); } +boost::optional<rgw::IAM::Policy> +get_iam_policy_from_attr(CephContext* cct, + const std::map<std::string, bufferlist>& attrs, + const std::string& tenant) +{ + if (auto i = attrs.find(RGW_ATTR_IAM_POLICY); i != attrs.end()) { + return Policy(cct, &tenant, i->second.to_str(), false); + } else { + return boost::none; + } +} diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index d7b0819d356..88f5f7a9c52 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -108,6 +108,7 @@ using ceph::crypto::MD5; #define RGW_ATTR_X_ROBOTS_TAG RGW_ATTR_PREFIX "x-robots-tag" #define RGW_ATTR_STORAGE_CLASS RGW_ATTR_PREFIX "storage_class" #define RGW_ATTR_BUCKET_LOGGING RGW_ATTR_PREFIX "logging" +#define RGW_ATTR_BUCKET_LOGGING_SOURCES RGW_ATTR_PREFIX "logging-sources" /* S3 Object Lock*/ #define RGW_ATTR_OBJECT_LOCK RGW_ATTR_PREFIX "object-lock" @@ -337,6 +338,7 @@ inline constexpr const char* RGW_REST_STS_XMLNS = #define ERR_PRESIGNED_URL_EXPIRED 2223 #define ERR_PRESIGNED_URL_DISABLED 2224 #define ERR_AUTHORIZATION 2225 // SNS 403 AuthorizationError +#define 
ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION 2226 #define ERR_BUSY_RESHARDING 2300 // also in cls_rgw_types.h, don't change! #define ERR_NO_SUCH_ENTITY 2301 @@ -1748,24 +1750,22 @@ rgw::IAM::Effect evaluate_iam_policies( bool verify_user_permission(const DoutPrefixProvider* dpp, req_state * const s, - const RGWAccessControlPolicy& user_acl, - const std::vector<rgw::IAM::Policy>& user_policies, - const std::vector<rgw::IAM::Policy>& session_policies, - const rgw::ARN& res, - const uint64_t op, - bool mandatory_policy=true); -bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, - req_state * const s, - const RGWAccessControlPolicy& user_acl, - const int perm); -bool verify_user_permission(const DoutPrefixProvider* dpp, - req_state * const s, const rgw::ARN& res, const uint64_t op, bool mandatory_policy=true); bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp, req_state * const s, int perm); +bool verify_bucket_permission(const DoutPrefixProvider* dpp, + struct perm_state_base * const s, + const rgw::ARN& arn, + bool account_root, + const RGWAccessControlPolicy& user_acl, + const RGWAccessControlPolicy& bucket_acl, + const boost::optional<rgw::IAM::Policy>& bucket_policy, + const std::vector<rgw::IAM::Policy>& identity_policies, + const std::vector<rgw::IAM::Policy>& session_policies, + const uint64_t op); bool verify_bucket_permission( const DoutPrefixProvider* dpp, req_state * const s, @@ -2013,3 +2013,8 @@ struct AioCompletionDeleter { void operator()(librados::AioCompletion* c) { c->release(); } }; using aio_completion_ptr = std::unique_ptr<librados::AioCompletion, AioCompletionDeleter>; + +extern boost::optional<rgw::IAM::Policy> +get_iam_policy_from_attr(CephContext* cct, + const std::map<std::string, bufferlist>& attrs, + const std::string& tenant); diff --git a/src/rgw/rgw_iam_policy.cc b/src/rgw/rgw_iam_policy.cc index 2a5c9cd313e..ef6761d4222 100644 --- a/src/rgw/rgw_iam_policy.cc +++ b/src/rgw/rgw_iam_policy.cc @@ -94,6 +94,8 
@@ static const actpair actpairs[] = { "s3:GetPublicAccessBlock", s3GetPublicAccessBlock }, { "s3:GetObjectAcl", s3GetObjectAcl }, { "s3:GetObject", s3GetObject }, + { "s3:GetObjectAttributes", s3GetObjectAttributes }, + { "s3:GetObjectVersionAttributes", s3GetObjectVersionAttributes }, { "s3:GetObjectTorrent", s3GetObjectTorrent }, { "s3:GetObjectVersionAcl", s3GetObjectVersionAcl }, { "s3:GetObjectVersion", s3GetObjectVersion }, @@ -1335,6 +1337,7 @@ const char* action_bit_string(uint64_t action) { case s3ListBucketVersions: return "s3:ListBucketVersions"; + case s3ListAllMyBuckets: return "s3:ListAllMyBuckets"; @@ -1479,6 +1482,12 @@ const char* action_bit_string(uint64_t action) { case s3BypassGovernanceRetention: return "s3:BypassGovernanceRetention"; + case s3GetObjectAttributes: + return "s3:GetObjectAttributes"; + + case s3GetObjectVersionAttributes: + return "s3:GetObjectVersionAttributes"; + case s3DescribeJob: return "s3:DescribeJob"; diff --git a/src/rgw/rgw_iam_policy.h b/src/rgw/rgw_iam_policy.h index 0476926143f..dd323ee4b9c 100644 --- a/src/rgw/rgw_iam_policy.h +++ b/src/rgw/rgw_iam_policy.h @@ -115,6 +115,8 @@ enum { s3GetBucketEncryption, s3PutBucketEncryption, s3DescribeJob, + s3GetObjectAttributes, + s3GetObjectVersionAttributes, s3All, s3objectlambdaGetObject, @@ -247,6 +249,8 @@ inline int op_to_perm(std::uint64_t op) { case s3GetObjectVersionTagging: case s3GetObjectRetention: case s3GetObjectLegalHold: + case s3GetObjectAttributes: + case s3GetObjectVersionAttributes: case s3ListAllMyBuckets: case s3ListBucket: case s3ListBucketMultipartUploads: diff --git a/src/rgw/rgw_kafka.cc b/src/rgw/rgw_kafka.cc index 0807993338d..b38b1a78ec4 100644 --- a/src/rgw/rgw_kafka.cc +++ b/src/rgw/rgw_kafka.cc @@ -13,6 +13,7 @@ #include <thread> #include <atomic> #include <mutex> +#include <boost/algorithm/string.hpp> #include <boost/functional/hash.hpp> #include <boost/lockfree/queue.hpp> #include "common/dout.h" @@ -595,7 +596,8 @@ public: 
boost::optional<const std::string&> ca_location, boost::optional<const std::string&> mechanism, boost::optional<const std::string&> topic_user_name, - boost::optional<const std::string&> topic_password) { + boost::optional<const std::string&> topic_password, + boost::optional<const std::string&> brokers) { if (stopped) { ldout(cct, 1) << "Kafka connect: manager is stopped" << dendl; return false; @@ -603,8 +605,8 @@ public: std::string user; std::string password; - std::string broker; - if (!parse_url_authority(url, broker, user, password)) { + std::string broker_list; + if (!parse_url_authority(url, broker_list, user, password)) { // TODO: increment counter ldout(cct, 1) << "Kafka connect: URL parsing failed" << dendl; return false; @@ -632,7 +634,13 @@ public: ldout(cct, 1) << "Kafka connect: user/password are only allowed over secure connection" << dendl; return false; } - connection_id_t tmp_id(broker, user, password, ca_location, mechanism, + + if (brokers.has_value()) { + broker_list.append(","); + broker_list.append(brokers.get()); + } + + connection_id_t tmp_id(broker_list, user, password, ca_location, mechanism, use_ssl); std::lock_guard lock(connections_lock); const auto it = connections.find(tmp_id); @@ -652,7 +660,7 @@ public: return false; } - auto conn = std::make_unique<connection_t>(cct, broker, use_ssl, verify_ssl, ca_location, user, password, mechanism); + auto conn = std::make_unique<connection_t>(cct, broker_list, use_ssl, verify_ssl, ca_location, user, password, mechanism); if (!new_producer(conn.get())) { ldout(cct, 10) << "Kafka connect: producer creation failed in new connection" << dendl; return false; @@ -770,11 +778,12 @@ bool connect(connection_id_t& conn_id, boost::optional<const std::string&> ca_location, boost::optional<const std::string&> mechanism, boost::optional<const std::string&> user_name, - boost::optional<const std::string&> password) { + boost::optional<const std::string&> password, + boost::optional<const std::string&> 
brokers) { std::shared_lock lock(s_manager_mutex); if (!s_manager) return false; return s_manager->connect(conn_id, url, use_ssl, verify_ssl, ca_location, - mechanism, user_name, password); + mechanism, user_name, password, brokers); } int publish(const connection_id_t& conn_id, diff --git a/src/rgw/rgw_kafka.h b/src/rgw/rgw_kafka.h index b7aa0d15759..858b185219f 100644 --- a/src/rgw/rgw_kafka.h +++ b/src/rgw/rgw_kafka.h @@ -48,7 +48,8 @@ bool connect(connection_id_t& conn_id, boost::optional<const std::string&> ca_location, boost::optional<const std::string&> mechanism, boost::optional<const std::string&> user_name, - boost::optional<const std::string&> password); + boost::optional<const std::string&> password, + boost::optional<const std::string&> brokers); // publish a message over a connection that was already created int publish(const connection_id_t& conn_id, diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index ee42ab647a1..1793c0b8065 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -25,8 +25,10 @@ #include "common/ceph_json.h" #include "common/static_ptr.h" #include "common/perf_counters_key.h" +#include "rgw_cksum.h" #include "rgw_cksum_digest.h" #include "rgw_common.h" +#include "common/split.h" #include "rgw_tracer.h" #include "rgw_rados.h" @@ -331,19 +333,6 @@ static int get_obj_policy_from_attr(const DoutPrefixProvider *dpp, return ret; } - -static boost::optional<Policy> -get_iam_policy_from_attr(CephContext* cct, - const map<string, bufferlist>& attrs, - const string& tenant) -{ - if (auto i = attrs.find(RGW_ATTR_IAM_POLICY); i != attrs.end()) { - return Policy(cct, &tenant, i->second.to_str(), false); - } else { - return none; - } -} - static boost::optional<PublicAccessBlockConfiguration> get_public_access_conf_from_attr(const map<string, bufferlist>& attrs) { @@ -3571,54 +3560,62 @@ void RGWCreateBucket::execute(optional_yield y) const rgw::SiteConfig& site = *s->penv.site; const std::optional<RGWPeriod>& period = site.get_period(); 
const RGWZoneGroup& my_zonegroup = site.get_zonegroup(); - - if (s->system_request) { - // allow system requests to override the target zonegroup. for forwarded - // requests, we'll create the bucket for the originating zonegroup - createparams.zonegroup_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "zonegroup"); - } - + const std::string rgwx_zonegroup = s->info.args.get(RGW_SYS_PARAM_PREFIX "zonegroup"); const RGWZoneGroup* bucket_zonegroup = &my_zonegroup; - if (createparams.zonegroup_id.empty()) { - // default to the local zonegroup - createparams.zonegroup_id = my_zonegroup.id; - } else if (period) { - auto z = period->period_map.zonegroups.find(createparams.zonegroup_id); - if (z == period->period_map.zonegroups.end()) { - ldpp_dout(this, 0) << "could not find zonegroup " - << createparams.zonegroup_id << " in current period" << dendl; - op_ret = -ENOENT; - return; - } - bucket_zonegroup = &z->second; - } else if (createparams.zonegroup_id != my_zonegroup.id) { - ldpp_dout(this, 0) << "zonegroup does not match current zonegroup " - << createparams.zonegroup_id << dendl; - op_ret = -ENOENT; - return; - } - // validate the LocationConstraint + // Validate LocationConstraint if it's provided and enforcement is strict if (!location_constraint.empty() && !relaxed_region_enforcement) { - // on the master zonegroup, allow any valid api_name. otherwise it has to - // match the bucket's zonegroup - if (period && my_zonegroup.is_master) { - if (!period->period_map.zonegroups_by_api.count(location_constraint)) { + if (period) { + auto location_iter = period->period_map.zonegroups_by_api.find(location_constraint); + if (location_iter == period->period_map.zonegroups_by_api.end()) { ldpp_dout(this, 0) << "location constraint (" << location_constraint << ") can't be found." 
<< dendl; op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; - s->err.message = "The specified location-constraint is not valid"; + s->err.message = fmt::format("The {} location constraint is not valid.", + location_constraint); return; } - } else if (bucket_zonegroup->api_name != location_constraint) { + bucket_zonegroup = &location_iter->second; + } else if (location_constraint != my_zonegroup.api_name) { // if we don't have a period, we can only use the current zonegroup - so check if the location matches by api name here ldpp_dout(this, 0) << "location constraint (" << location_constraint - << ") doesn't match zonegroup (" << bucket_zonegroup->api_name - << ')' << dendl; - op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; - s->err.message = "The specified location-constraint is not valid"; + << ") doesn't match zonegroup (" << my_zonegroup.api_name << ")" << dendl; + op_ret = -ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION; + s->err.message = fmt::format("The {} location constraint is incompatible " + "for the region specific endpoint this request was sent to.", + location_constraint); return; } } + // If it's a system request, use the provided zonegroup if available + else if (s->system_request && !rgwx_zonegroup.empty()) { + if (period) { + auto zonegroup_iter = period->period_map.zonegroups.find(rgwx_zonegroup); + if (zonegroup_iter == period->period_map.zonegroups.end()) { + ldpp_dout(this, 0) << "could not find zonegroup " << rgwx_zonegroup + << " in current period" << dendl; + op_ret = -ENOENT; + return; + } + bucket_zonegroup = &zonegroup_iter->second; + } + } + + const bool enforce_location_match = + !period || // No period: no multisite, so no need to enforce location match. + !s->system_request || // All user requests are enforced to match zonegroup's location. + !my_zonegroup.is_master; // but if it's a system request (forwarded) only allow remote creation on master zonegroup. 
+ if (enforce_location_match && !my_zonegroup.equals(bucket_zonegroup->get_id())) { + ldpp_dout(this, 0) << "location constraint (" << bucket_zonegroup->api_name + << ") doesn't match zonegroup (" << my_zonegroup.api_name << ")" << dendl; + op_ret = -ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION; + s->err.message = fmt::format("The {} location constraint is incompatible " + "for the region specific endpoint this request was sent to.", + bucket_zonegroup->api_name); + return; + } + + // Set the final zonegroup ID + createparams.zonegroup_id = bucket_zonegroup->id; // select and validate the placement target op_ret = select_bucket_placement(this, *bucket_zonegroup, s->user->get_info(), @@ -3627,7 +3624,7 @@ void RGWCreateBucket::execute(optional_yield y) return; } - if (bucket_zonegroup == &my_zonegroup) { + if (my_zonegroup.equals(bucket_zonegroup->get_id())) { // look up the zone placement pool createparams.zone_placement = rgw::find_zone_placement( this, site.get_zone_params(), createparams.placement_rule); @@ -3716,7 +3713,6 @@ void RGWCreateBucket::execute(optional_yield y) if (!driver->is_meta_master()) { // apply bucket creation on the master zone first - bufferlist in_data; JSONParser jp; op_ret = rgw_forward_request_to_master(this, *s->penv.site, s->owner.id, &in_data, &jp, s->info, y); @@ -3793,7 +3789,10 @@ void RGWCreateBucket::execute(optional_yield y) s->bucket->get_info().has_website = !s->bucket->get_info().website_conf.is_empty(); /* This will also set the quota on the bucket. */ - op_ret = s->bucket->merge_and_store_attrs(this, createparams.attrs, y); + s->bucket->set_attrs(std::move(createparams.attrs)); + constexpr bool exclusive = false; // overwrite + constexpr ceph::real_time no_set_mtime{}; + op_ret = s->bucket->put_info(this, exclusive, no_set_mtime, y); } while (op_ret == -ECANCELED && tries++ < 20); /* Restore the proper return code. 
*/ @@ -4344,6 +4343,9 @@ void RGWPutObj::execute(optional_yield y) } return; } + + multipart_cksum_type = upload->cksum_type; + /* upload will go out of scope, so copy the dest placement for later use */ s->dest_placement = *pdest_placement; pdest_placement = &s->dest_placement; @@ -4474,11 +4476,12 @@ void RGWPutObj::execute(optional_yield y) /* optional streaming checksum */ try { cksum_filter = - rgw::putobj::RGWPutObj_Cksum::Factory(filter, *s->info.env); + rgw::putobj::RGWPutObj_Cksum::Factory(filter, *s->info.env, multipart_cksum_type); } catch (const rgw::io::Exception& e) { op_ret = -e.code().value(); return; } + if (cksum_filter) { filter = &*cksum_filter; } @@ -4625,10 +4628,12 @@ void RGWPutObj::execute(optional_yield y) if (cksum_filter) { const auto& hdr = cksum_filter->header(); + auto expected_ck = cksum_filter->expected(*s->info.env); auto cksum_verify = cksum_filter->verify(*s->info.env); // valid or no supplied cksum cksum = get<1>(cksum_verify); - if (std::get<0>(cksum_verify)) { + if ((!expected_ck) || + std::get<0>(cksum_verify)) { buffer::list cksum_bl; ldpp_dout_fmt(this, 16, @@ -4636,14 +4641,13 @@ void RGWPutObj::execute(optional_yield y) "\n\tcomputed={} == \n\texpected={}", hdr.second, cksum->to_armor(), - cksum_filter->expected(*s->info.env)); + (!!expected_ck) ? 
expected_ck : "(checksum unavailable)"); cksum->encode(cksum_bl); emplace_attr(RGW_ATTR_CKSUM, std::move(cksum_bl)); } else { /* content checksum mismatch */ auto computed_ck = cksum->to_armor(); - auto expected_ck = cksum_filter->expected(*s->info.env); ldpp_dout_fmt(this, 4, "{} content checksum mismatch" @@ -4846,7 +4850,8 @@ void RGWPostObj::execute(optional_yield y) /* optional streaming checksum */ try { cksum_filter = - rgw::putobj::RGWPutObj_Cksum::Factory(filter, *s->info.env); + rgw::putobj::RGWPutObj_Cksum::Factory( + filter, *s->info.env, rgw::cksum::Type::none /* no override */); } catch (const rgw::io::Exception& e) { op_ret = -e.code().value(); return; @@ -5194,7 +5199,10 @@ void RGWPutMetadataBucket::execute(optional_yield y) /* Setting attributes also stores the provided bucket info. Due * to this fact, the new quota settings can be serialized with * the same call. */ - op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield); + s->bucket->set_attrs(attrs); + constexpr bool exclusive = false; // overwrite + constexpr ceph::real_time no_set_mtime{}; + op_ret = s->bucket->put_info(this, exclusive, no_set_mtime, s->yield); return op_ret; }, y); } @@ -5982,8 +5990,6 @@ void RGWGetACLs::execute(optional_yield y) acls = ss.str(); } - - int RGWPutACLs::verify_permission(optional_yield y) { bool perm; @@ -6005,6 +6011,74 @@ int RGWPutACLs::verify_permission(optional_yield y) return 0; } +uint16_t RGWGetObjAttrs::recognize_attrs(const std::string& hdr, uint16_t deflt) +{ + auto attrs{deflt}; + auto sa = ceph::split(hdr, ","); + for (auto& k : sa) { + if (boost::iequals(k, "etag")) { + attrs |= as_flag(ReqAttributes::Etag); + } + if (boost::iequals(k, "checksum")) { + attrs |= as_flag(ReqAttributes::Checksum); + } + if (boost::iequals(k, "objectparts")) { + attrs |= as_flag(ReqAttributes::ObjectParts); + } + if (boost::iequals(k, "objectsize")) { + attrs |= as_flag(ReqAttributes::ObjectSize); + } + if (boost::iequals(k, "storageclass")) { + attrs |= 
as_flag(ReqAttributes::StorageClass); + } + } + return attrs; +} /* RGWGetObjAttrs::recognize_attrs */ + +int RGWGetObjAttrs::verify_permission(optional_yield y) +{ + bool perm = false; + auto [has_s3_existing_tag, has_s3_resource_tag] = + rgw_check_policy_condition(this, s); + + if (! rgw::sal::Object::empty(s->object.get())) { + + auto iam_action1 = s->object->get_instance().empty() ? + rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion; + + auto iam_action2 = s->object->get_instance().empty() ? + rgw::IAM::s3GetObjectAttributes : + rgw::IAM::s3GetObjectVersionAttributes; + + if (has_s3_existing_tag || has_s3_resource_tag) { + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + } + + /* XXXX the following conjunction should be &&--but iam_action2 is currently not + * hooked up and always fails (but should succeed if the requestor has READ + * acess to the object) */ + perm = (verify_object_permission(this, s, iam_action1) || /* && */ + verify_object_permission(this, s, iam_action2)); + } + + if (! perm) { + return -EACCES; + } + + return 0; +} + +void RGWGetObjAttrs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObjAttrs::execute(optional_yield y) +{ + RGWGetObj::execute(y); +} /* RGWGetObjAttrs::execute */ + int RGWGetLC::verify_permission(optional_yield y) { auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); @@ -6672,6 +6746,14 @@ try_sum_part_cksums(const DoutPrefixProvider *dpp, ++parts_ix; auto& part_cksum = part.second->get_cksum(); + if (! 
part_cksum) { + ldpp_dout_fmt(dpp, 0, + "ERROR: multipart part checksum not present (ix=={})", + parts_ix); + op_ret = -ERR_INVALID_REQUEST; + return op_ret; + } + ldpp_dout_fmt(dpp, 16, "INFO: {} iterate part: {} {} {}", __func__, parts_ix, part_cksum->type_string(), @@ -8500,6 +8582,10 @@ void RGWGetBucketPolicy::execute(optional_yield y) void RGWDeleteBucketPolicy::send_response() { + if (!op_ret) { + /* A successful Delete Bucket Policy should return a 204 on success */ + op_ret = STATUS_NO_CONTENT; + } if (op_ret) { set_req_state_err(s, op_ret); } @@ -9175,4 +9261,3 @@ void rgw_slo_entry::decode_json(JSONObj *obj) JSONDecoder::decode_json("etag", etag, obj); JSONDecoder::decode_json("size_bytes", size_bytes, obj); }; - diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 9f747501729..dcf64c31572 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -12,6 +12,7 @@ #pragma once +#include <cstdint> #include <limits.h> #include <array> @@ -1111,6 +1112,7 @@ class RGWCreateBucket : public RGWOp { bool relaxed_region_enforcement = false; RGWCORSConfiguration cors_config; std::set<std::string> rmattr_names; + bufferlist in_data; virtual bool need_metadata_upload() const { return false; } @@ -1237,6 +1239,7 @@ protected: std::string multipart_upload_id; std::string multipart_part_str; int multipart_part_num = 0; + rgw::cksum::Type multipart_cksum_type{rgw::cksum::Type::none}; jspan_ptr multipart_trace; boost::optional<ceph::real_time> delete_at; @@ -1644,6 +1647,50 @@ public: uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } }; +class RGWGetObjAttrs : public RGWGetObj { +protected: + std::string version_id; + std::string expected_bucket_owner; + std::optional<int> marker; + std::optional<int> max_parts; + uint16_t requested_attributes{0}; +#if 0 + /* used to decrypt attributes for objects stored with SSE-C */ + x-amz-server-side-encryption-customer-algorithm + x-amz-server-side-encryption-customer-key + x-amz-server-side-encryption-customer-key-MD5 +#endif 
+public: + + enum class ReqAttributes : uint16_t { + None = 0, + Etag, + Checksum, + ObjectParts, + StorageClass, + ObjectSize + }; + + static uint16_t as_flag(ReqAttributes attr) { + return 1 << (uint16_t(attr) ? uint16_t(attr) - 1 : 0); + } + + static uint16_t recognize_attrs(const std::string& hdr, uint16_t deflt = 0); + + RGWGetObjAttrs() : RGWGetObj() + { + RGWGetObj::get_data = false; // it's extra false + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + void send_response() override = 0; + const char* name() const override { return "get_obj_attrs"; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ_ATTRS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; /* RGWGetObjAttrs */ + class RGWGetLC : public RGWOp { protected: diff --git a/src/rgw/rgw_op_type.h b/src/rgw/rgw_op_type.h index 49faea6403d..2c8225d289e 100644 --- a/src/rgw/rgw_op_type.h +++ b/src/rgw/rgw_op_type.h @@ -30,6 +30,7 @@ enum RGWOpType { RGW_OP_COPY_OBJ, RGW_OP_GET_ACLS, RGW_OP_PUT_ACLS, + RGW_OP_GET_OBJ_ATTRS, RGW_OP_GET_CORS, RGW_OP_PUT_CORS, RGW_OP_DELETE_CORS, diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h index aa33080af56..9111696453e 100644 --- a/src/rgw/rgw_rest.h +++ b/src/rgw/rgw_rest.h @@ -403,6 +403,17 @@ public: virtual std::string canonical_name() const override { return fmt::format("REST.{}.ACL", s->info.method); } }; +class RGWGetObjAttrs_ObjStore : public RGWGetObjAttrs { +public: + RGWGetObjAttrs_ObjStore() {} + ~RGWGetObjAttrs_ObjStore() override {} + + int get_params(optional_yield y) = 0; + /* not actually used */ + int send_response_data_error(optional_yield y) override { return 0; }; + int send_response_data(bufferlist& bl, off_t ofs, off_t len) override { return 0; }; +}; + class RGWGetLC_ObjStore : public RGWGetLC { public: RGWGetLC_ObjStore() {} diff --git a/src/rgw/rgw_rest_bucket_logging.cc b/src/rgw/rgw_rest_bucket_logging.cc index ed12ce855a9..afd79b0a548 
100644 --- a/src/rgw/rgw_rest_bucket_logging.cc +++ b/src/rgw/rgw_rest_bucket_logging.cc @@ -58,30 +58,29 @@ public: return; } - std::unique_ptr<rgw::sal::Bucket> bucket; - op_ret = driver->load_bucket(this, rgw_bucket(s->bucket_tenant, s->bucket_name), - &bucket, y); + const rgw_bucket src_bucket_id(s->bucket_tenant, s->bucket_name); + std::unique_ptr<rgw::sal::Bucket> src_bucket; + op_ret = driver->load_bucket(this, src_bucket_id, + &src_bucket, y); if (op_ret < 0) { - ldpp_dout(this, 1) << "ERROR: failed to get bucket '" << - (s->bucket_tenant.empty() ? s->bucket_name : s->bucket_tenant + ":" + s->bucket_name) << - "' info, ret = " << op_ret << dendl; + ldpp_dout(this, 1) << "ERROR: failed to get bucket '" << src_bucket_id << "', ret = " << op_ret << dendl; return; } - if (auto iter = bucket->get_attrs().find(RGW_ATTR_BUCKET_LOGGING); iter != bucket->get_attrs().end()) { + if (auto iter = src_bucket->get_attrs().find(RGW_ATTR_BUCKET_LOGGING); iter != src_bucket->get_attrs().end()) { try { configuration.enabled = true; decode(configuration, iter->second); } catch (buffer::error& err) { - ldpp_dout(this, 1) << "ERROR: failed to decode attribute '" << RGW_ATTR_BUCKET_LOGGING - << "'. error: " << err.what() << dendl; + ldpp_dout(this, 1) << "WARNING: failed to decode logging attribute '" << RGW_ATTR_BUCKET_LOGGING + << "' for bucket '" << src_bucket_id << "', error: " << err.what() << dendl; op_ret = -EIO; return; } } else { - ldpp_dout(this, 5) << "WARNING: no logging configuration on bucket '" << bucket->get_name() << "'" << dendl; + ldpp_dout(this, 5) << "WARNING: no logging configuration on bucket '" << src_bucket_id << "'" << dendl; return; } - ldpp_dout(this, 20) << "INFO: found logging configuration on bucket '" << bucket->get_name() << "'" + ldpp_dout(this, 20) << "INFO: found logging configuration on bucket '" << src_bucket_id << "'" << "'. 
configuration: " << configuration.to_json_str() << dendl; } @@ -159,58 +158,125 @@ class RGWPutBucketLoggingOp : public RGWDefaultResponseOp { return; } - std::unique_ptr<rgw::sal::Bucket> bucket; - op_ret = driver->load_bucket(this, rgw_bucket(s->bucket_tenant, s->bucket_name), - &bucket, y); + const rgw_bucket src_bucket_id(s->bucket_tenant, s->bucket_name); + std::unique_ptr<rgw::sal::Bucket> src_bucket; + op_ret = driver->load_bucket(this, src_bucket_id, + &src_bucket, y); if (op_ret < 0) { - ldpp_dout(this, 1) << "ERROR: failed to get bucket '" << s->bucket_name << "', ret = " << op_ret << dendl; + ldpp_dout(this, 1) << "ERROR: failed to get bucket '" << src_bucket_id << "', ret = " << op_ret << dendl; return; } - - auto& attrs = bucket->get_attrs(); if (!configuration.enabled) { - if (auto iter = attrs.find(RGW_ATTR_BUCKET_LOGGING); iter != attrs.end()) { - attrs.erase(iter); - } - } else { - std::unique_ptr<rgw::sal::Bucket> target_bucket; - op_ret = driver->load_bucket(this, rgw_bucket(s->bucket_tenant, configuration.target_bucket), - &target_bucket, y); - if (op_ret < 0) { - ldpp_dout(this, 1) << "ERROR: failed to get target bucket '" << configuration.target_bucket << "', ret = " << op_ret << dendl; - return; - } - const auto& target_attrs = target_bucket->get_attrs(); - if (target_attrs.find(RGW_ATTR_BUCKET_LOGGING) != target_attrs.end()) { - // target bucket must not have logging set on it - ldpp_dout(this, 1) << "ERROR: logging target bucket '" << configuration.target_bucket << "', is configured with bucket logging" << dendl; - op_ret = -EINVAL; - return; - } - // TODO: verify target bucket does not have encryption - bufferlist conf_bl; - encode(configuration, conf_bl); - attrs[RGW_ATTR_BUCKET_LOGGING] = conf_bl; - // TODO: should we add attribute to target bucket indicating it is target to bucket logging? - // if we do, how do we maintain it when bucket logging changes? 
+ op_ret = rgw::bucketlogging::source_bucket_cleanup(this, driver, src_bucket.get(), true, y); + return; + } + + // set logging configuration + rgw_bucket target_bucket_id; + if (op_ret = rgw::bucketlogging::get_bucket_id(configuration.target_bucket, s->bucket_tenant, target_bucket_id); op_ret < 0) { + ldpp_dout(this, 1) << "ERROR: failed to parse target bucket '" << configuration.target_bucket << "', ret = " << op_ret << dendl; + return; + } + + if (target_bucket_id == src_bucket_id) { + ldpp_dout(this, 1) << "ERROR: target bucket '" << target_bucket_id << "' must be different from source bucket" << dendl; + op_ret = -EINVAL; + return; + } + std::unique_ptr<rgw::sal::Bucket> target_bucket; + op_ret = driver->load_bucket(this, target_bucket_id, + &target_bucket, y); + if (op_ret < 0) { + ldpp_dout(this, 1) << "ERROR: failed to get target bucket '" << target_bucket_id << "', ret = " << op_ret << dendl; + return; + } + auto& target_attrs = target_bucket->get_attrs(); + if (target_attrs.find(RGW_ATTR_BUCKET_LOGGING) != target_attrs.end()) { + // target bucket must not have logging set on it + ldpp_dout(this, 1) << "ERROR: logging target bucket '" << target_bucket_id << "', is configured with bucket logging" << dendl; + op_ret = -EINVAL; + return; } - // TODO: use retry_raced_bucket_write from rgw_op.cc - op_ret = bucket->merge_and_store_attrs(this, attrs, y); + // verify target bucket does not have encryption + if (target_attrs.find(RGW_ATTR_BUCKET_ENCRYPTION_POLICY) != target_attrs.end()) { + ldpp_dout(this, 1) << "ERROR: logging target bucket '" << target_bucket_id << "', is configured with encryption" << dendl; + op_ret = -EINVAL; + return; + } + std::optional<rgw::bucketlogging::configuration> old_conf; + bufferlist conf_bl; + encode(configuration, conf_bl); + op_ret = retry_raced_bucket_write(this, src_bucket.get(), [this, &conf_bl, &src_bucket, &old_conf, &configuration, y] { + auto& attrs = src_bucket->get_attrs(); + auto it = 
attrs.find(RGW_ATTR_BUCKET_LOGGING); + if (it != attrs.end()) { + try { + rgw::bucketlogging::configuration tmp_conf; + tmp_conf.enabled = true; + decode(tmp_conf, it->second); + old_conf = std::move(tmp_conf); + } catch (buffer::error& err) { + ldpp_dout(this, 1) << "WARNING: failed to decode existing logging attribute '" << RGW_ATTR_BUCKET_LOGGING + << "' for bucket '" << src_bucket->get_info().bucket << "', error: " << err.what() << dendl; + } + if (!old_conf || (old_conf && *old_conf != configuration)) { + // conf changed (or was unknown) - update + it->second = conf_bl; + return src_bucket->merge_and_store_attrs(this, attrs, y); + } + // nothing to update + return 0; + } + // conf was added + attrs.insert(std::make_pair(RGW_ATTR_BUCKET_LOGGING, conf_bl)); + return src_bucket->merge_and_store_attrs(this, attrs, y); + }, y); if (op_ret < 0) { ldpp_dout(this, 1) << "ERROR: failed to set logging attribute '" << RGW_ATTR_BUCKET_LOGGING << "' to bucket '" << - bucket->get_name() << "', ret = " << op_ret << dendl; + src_bucket_id << "', ret = " << op_ret << dendl; return; } - - ldpp_dout(this, 20) << "INFO: " << (configuration.enabled ? "wrote" : "removed") - << " logging configuration. bucket '" << bucket->get_name() << "'. configuration: " << - configuration.to_json_str() << dendl; + if (!old_conf) { + ldpp_dout(this, 20) << "INFO: new logging configuration added to bucket '" << src_bucket_id << "'. 
configuration: " << + configuration.to_json_str() << dendl; + if (const auto ret = rgw::bucketlogging::update_bucket_logging_sources(this, target_bucket, src_bucket_id, true, y); ret < 0) { + ldpp_dout(this, 1) << "WARNING: failed to add source bucket '" << src_bucket_id << "' to logging sources of target bucket '" << + target_bucket_id << "', ret = " << ret << dendl; + } + } else if (*old_conf != configuration) { + // conf changed - do cleanup + if (const auto ret = commit_logging_object(*old_conf, target_bucket, this, y); ret < 0) { + ldpp_dout(this, 1) << "WARNING: could not commit pending logging object when updating logging configuration of bucket '" << + src_bucket->get_info().bucket << "', ret = " << ret << dendl; + } else { + ldpp_dout(this, 20) << "INFO: committed pending logging object when updating logging configuration of bucket '" << + src_bucket->get_info().bucket << "'" << dendl; + } + if (old_conf->target_bucket != configuration.target_bucket) { + rgw_bucket old_target_bucket_id; + if (const auto ret = rgw::bucketlogging::get_bucket_id(old_conf->target_bucket, s->bucket_tenant, old_target_bucket_id); ret < 0) { + ldpp_dout(this, 1) << "ERROR: failed to parse target bucket '" << old_conf->target_bucket << "', ret = " << ret << dendl; + return; + } + if (const auto ret = rgw::bucketlogging::update_bucket_logging_sources(this, driver, old_target_bucket_id, src_bucket_id, false, y); ret < 0) { + ldpp_dout(this, 1) << "WARNING: failed to remove source bucket '" << src_bucket_id << "' from logging sources of original target bucket '" << + old_target_bucket_id << "', ret = " << ret << dendl; + } + if (const auto ret = rgw::bucketlogging::update_bucket_logging_sources(this, target_bucket, src_bucket_id, true, y); ret < 0) { + ldpp_dout(this, 1) << "WARNING: failed to add source bucket '" << src_bucket_id << "' to logging sources of target bucket '" << + target_bucket_id << "', ret = " << ret << dendl; + } + } + ldpp_dout(this, 20) << "INFO: wrote logging 
configuration to bucket '" << src_bucket_id << "'. configuration: " << + configuration.to_json_str() << dendl; + } else { + ldpp_dout(this, 20) << "INFO: logging configuration of bucket '" << src_bucket_id << "' did not change" << dendl; + } } }; // Post /<bucket name>/?logging -// actual configuration is XML encoded in the body of the message class RGWPostBucketLoggingOp : public RGWDefaultResponseOp { int verify_permission(optional_yield y) override { auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); @@ -234,17 +300,18 @@ class RGWPostBucketLoggingOp : public RGWDefaultResponseOp { return; } - std::unique_ptr<rgw::sal::Bucket> bucket; - op_ret = driver->load_bucket(this, rgw_bucket(s->bucket_tenant, s->bucket_name), - &bucket, y); + const rgw_bucket src_bucket_id(s->bucket_tenant, s->bucket_name); + std::unique_ptr<rgw::sal::Bucket> src_bucket; + op_ret = driver->load_bucket(this, src_bucket_id, + &src_bucket, y); if (op_ret < 0) { - ldpp_dout(this, 1) << "ERROR: failed to get bucket '" << s->bucket_name << "', ret = " << op_ret << dendl; + ldpp_dout(this, 1) << "ERROR: failed to get bucket '" << src_bucket_id << "', ret = " << op_ret << dendl; return; } - const auto& bucket_attrs = bucket->get_attrs(); + const auto& bucket_attrs = src_bucket->get_attrs(); auto iter = bucket_attrs.find(RGW_ATTR_BUCKET_LOGGING); if (iter == bucket_attrs.end()) { - ldpp_dout(this, 1) << "WARNING: no logging configured on bucket" << dendl; + ldpp_dout(this, 1) << "WARNING: no logging configured on bucket '" << src_bucket_id << "'" << dendl; return; } rgw::bucketlogging::configuration configuration; @@ -252,33 +319,38 @@ class RGWPostBucketLoggingOp : public RGWDefaultResponseOp { configuration.enabled = true; decode(configuration, iter->second); } catch (buffer::error& err) { - ldpp_dout(this, 1) << "ERROR: failed to decode logging attribute '" << RGW_ATTR_BUCKET_LOGGING - << "'. 
error: " << err.what() << dendl; + ldpp_dout(this, 1) << "WARNING: failed to decode logging attribute '" << RGW_ATTR_BUCKET_LOGGING + << "' for bucket '" << src_bucket_id << "', error: " << err.what() << dendl; op_ret = -EINVAL; return; } + rgw_bucket target_bucket_id; + if (op_ret = rgw::bucketlogging::get_bucket_id(configuration.target_bucket, s->bucket_tenant, target_bucket_id); op_ret < 0) { + ldpp_dout(this, 1) << "ERROR: failed to parse target bucket '" << configuration.target_bucket << "', ret = " << op_ret << dendl; + return; + } std::unique_ptr<rgw::sal::Bucket> target_bucket; - op_ret = driver->load_bucket(this, rgw_bucket(s->bucket_tenant, configuration.target_bucket), + op_ret = driver->load_bucket(this, target_bucket_id, &target_bucket, y); if (op_ret < 0) { - ldpp_dout(this, 1) << "ERROR: failed to get target bucket '" << configuration.target_bucket << "', ret = " << op_ret << dendl; + ldpp_dout(this, 1) << "ERROR: failed to get target bucket '" << target_bucket_id << "', ret = " << op_ret << dendl; return; } std::string obj_name; RGWObjVersionTracker objv_tracker; op_ret = target_bucket->get_logging_object_name(obj_name, configuration.target_prefix, null_yield, this, &objv_tracker); if (op_ret < 0) { - ldpp_dout(this, 1) << "ERROR: failed to get pending logging object name from target bucket '" << configuration.target_bucket << "'" << dendl; + ldpp_dout(this, 1) << "ERROR: failed to get pending logging object name from target bucket '" << target_bucket_id << "'" << dendl; return; } op_ret = rgw::bucketlogging::rollover_logging_object(configuration, target_bucket, obj_name, this, null_yield, true, &objv_tracker); if (op_ret < 0) { ldpp_dout(this, 1) << "ERROR: failed to flush pending logging object '" << obj_name - << "' to target bucket '" << configuration.target_bucket << "'" << dendl; + << "' to target bucket '" << target_bucket_id << "'" << dendl; return; } - ldpp_dout(this, 20) << "flushed pending logging object '" << obj_name + ldpp_dout(this, 
20) << "INFO: flushed pending logging object '" << obj_name << "' to target bucket '" << configuration.target_bucket << "'" << dendl; } }; diff --git a/src/rgw/rgw_rest_pubsub.cc b/src/rgw/rgw_rest_pubsub.cc index adfc86d87cb..f1ffe09cf25 100644 --- a/src/rgw/rgw_rest_pubsub.cc +++ b/src/rgw/rgw_rest_pubsub.cc @@ -234,7 +234,13 @@ bool verify_topic_permission(const DoutPrefixProvider* dpp, req_state* s, return verify_topic_permission(dpp, s, topic.owner, arn, policy, op); } -// command (AWS compliant): +bool should_forward_request_to_master(req_state* s, rgw::sal::Driver* driver) { + return (!driver->is_meta_master() && + rgw::all_zonegroups_support(*s->penv.site, + rgw::zone_features::notification_v2)); +} + +// command (AWS compliant): // POST // Action=CreateTopic&Name=<topic-name>[&OpaqueData=data][&push-endpoint=<endpoint>[&persistent][&<arg1>=<value1>]] class RGWPSCreateTopicOp : public RGWOp { @@ -273,7 +279,7 @@ class RGWPSCreateTopicOp : public RGWOp { // Remove the args that are parsed, so the push_endpoint_args only contains // necessary one's which is parsed after this if. but only if master zone, // else we do not remove as request is forwarded to master. - if (driver->is_meta_master()) { + if (!should_forward_request_to_master(s, driver)) { s->info.args.remove("OpaqueData"); s->info.args.remove("push-endpoint"); s->info.args.remove("persistent"); @@ -396,7 +402,7 @@ class RGWPSCreateTopicOp : public RGWOp { void RGWPSCreateTopicOp::execute(optional_yield y) { // master request will replicate the topic creation. 
- if (!driver->is_meta_master()) { + if (should_forward_request_to_master(s, driver)) { op_ret = rgw_forward_request_to_master( this, *s->penv.site, s->owner.id, &bl_post_body, nullptr, s->info, y); if (op_ret < 0) { @@ -863,7 +869,7 @@ class RGWPSSetTopicAttributesOp : public RGWOp { }; void RGWPSSetTopicAttributesOp::execute(optional_yield y) { - if (!driver->is_meta_master()) { + if (should_forward_request_to_master(s, driver)) { op_ret = rgw_forward_request_to_master( this, *s->penv.site, s->owner.id, &bl_post_body, nullptr, s->info, y); if (op_ret < 0) { @@ -1008,9 +1014,10 @@ class RGWPSDeleteTopicOp : public RGWOp { }; void RGWPSDeleteTopicOp::execute(optional_yield y) { - if (!driver->is_meta_master()) { + if (should_forward_request_to_master(s, driver)) { op_ret = rgw_forward_request_to_master( this, *s->penv.site, s->owner.id, &bl_post_body, nullptr, s->info, y); + if (op_ret < 0) { ldpp_dout(this, 1) << "DeleteTopic forward_request_to_master returned ret = " << op_ret @@ -1260,7 +1267,7 @@ int RGWPSCreateNotifOp::verify_permission(optional_yield y) { } void RGWPSCreateNotifOp::execute(optional_yield y) { - if (!driver->is_meta_master()) { + if (should_forward_request_to_master(s, driver)) { op_ret = rgw_forward_request_to_master( this, *s->penv.site, s->owner.id, &data, nullptr, s->info, y); if (op_ret < 0) { @@ -1462,7 +1469,7 @@ int RGWPSDeleteNotifOp::verify_permission(optional_yield y) { } void RGWPSDeleteNotifOp::execute(optional_yield y) { - if (!driver->is_meta_master()) { + if (should_forward_request_to_master(s, driver)) { bufferlist indata; op_ret = rgw_forward_request_to_master( this, *s->penv.site, s->owner.id, &indata, nullptr, s->info, y); diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index 30ebe8e8965..885991244a6 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -9,6 +9,7 @@ #include <string_view> #include "common/ceph_crypto.h" +#include "common/dout.h" #include "common/split.h" #include 
"common/Formatter.h" #include "common/utf8.h" @@ -807,7 +808,6 @@ void RGWGetObjTags_ObjStore_S3::send_response_data(bufferlist& bl) } } - int RGWPutObjTags_ObjStore_S3::get_params(optional_yield y) { RGWXMLParser parser; @@ -2533,6 +2533,10 @@ int RGWCreateBucket_ObjStore_S3::get_params(optional_yield y) if ((op_ret < 0) && (op_ret != -ERR_LENGTH_REQUIRED)) return op_ret; + if (!driver->is_meta_master()) { + in_data.append(data); + } + if (data.length()) { RGWCreateBucketParser parser; @@ -3815,6 +3819,196 @@ void RGWPutACLs_ObjStore_S3::send_response() dump_start(s); } +int RGWGetObjAttrs_ObjStore_S3::get_params(optional_yield y) +{ + string err; + auto& env = s->info.env; + version_id = s->info.args.get("versionId"); + + auto hdr = env->get_optional("HTTP_X_AMZ_EXPECTED_BUCKET_OWNER"); + if (hdr) { + expected_bucket_owner = *hdr; + } + + hdr = env->get_optional("HTTP_X_AMZ_MAX_PARTS"); + if (hdr) { + max_parts = strict_strtol(hdr->c_str(), 10, &err); + if (!err.empty()) { + s->err.message = "Invalid value for MaxParts: " + err; + ldpp_dout(s, 10) << "Invalid value for MaxParts " << *hdr << ": " + << err << dendl; + return -ERR_INVALID_PART; + } + max_parts = std::min(*max_parts, 1000); + } + + hdr = env->get_optional("HTTP_X_AMZ_PART_NUMBER_MARKER"); + if (hdr) { + marker = strict_strtol(hdr->c_str(), 10, &err); + if (!err.empty()) { + s->err.message = "Invalid value for PartNumberMarker: " + err; + ldpp_dout(s, 10) << "Invalid value for PartNumberMarker " << *hdr << ": " + << err << dendl; + return -ERR_INVALID_PART; + } + } + + hdr = env->get_optional("HTTP_X_AMZ_OBJECT_ATTRIBUTES"); + if (hdr) { + requested_attributes = recognize_attrs(*hdr); + } + + /* XXX skipping SSE-C params for now */ + + return 0; +} /* RGWGetObjAttrs_ObjStore_S3::get_params(...) 
*/ + +int RGWGetObjAttrs_ObjStore_S3::get_decrypt_filter( + std::unique_ptr<RGWGetObj_Filter> *filter, + RGWGetObj_Filter* cb, bufferlist* manifest_bl) +{ + // we aren't actually decrypting the data, but for objects encrypted with + // SSE-C we do need to verify that required headers are present and valid + // + // in the SSE-KMS and SSE-S3 cases, this unfortunately causes us to fetch + // decryption keys which we don't need :( + std::unique_ptr<BlockCrypt> block_crypt; // ignored + std::map<std::string, std::string> crypt_http_responses; // ignored + return rgw_s3_prepare_decrypt(s, s->yield, attrs, &block_crypt, + crypt_http_responses); +} + +void RGWGetObjAttrs_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + + if (op_ret == 0) { + version_id = s->object->get_instance(); + + // x-amz-delete-marker: DeleteMarker // not sure we can plausibly do this? + dump_last_modified(s, lastmod); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + // x-amz-request-charged: RequestCharged + } + + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + if (op_ret == 0) { + s->formatter->open_object_section("GetObjectAttributes"); + if (requested_attributes & as_flag(ReqAttributes::Etag)) { + if (lo_etag.empty()) { + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + lo_etag = iter->second.to_str(); + } + } + s->formatter->dump_string("ETag", lo_etag); + } + + if (requested_attributes & as_flag(ReqAttributes::Checksum)) { + s->formatter->open_object_section("Checksum"); + auto iter = attrs.find(RGW_ATTR_CKSUM); + if (iter != attrs.end()) { + try { + rgw::cksum::Cksum cksum; + auto bliter = iter->second.cbegin(); + cksum.decode(bliter); + if (multipart_parts_count && multipart_parts_count > 0) { + s->formatter->dump_string(cksum.element_name(), + fmt::format("{}-{}", cksum.to_armor(), *multipart_parts_count)); + } else { + s->formatter->dump_string(cksum.element_name(), cksum.to_armor()); + } 
+ } catch (buffer::error& err) { + ldpp_dout(this, 0) + << "ERROR: could not decode stored cksum, caught buffer::error" << dendl; + } + } + s->formatter->close_section(); /* Checksum */ + } /* Checksum */ + + if (requested_attributes & as_flag(ReqAttributes::ObjectParts)) { + if (multipart_parts_count && multipart_parts_count > 0) { + + /* XXX the following was needed to see a manifest at list_parts()! */ + op_ret = s->object->load_obj_state(s, s->yield); + if (op_ret < 0) { + ldpp_dout_fmt(this, 0, + "ERROR: {} load_obj_state() failed ret={}", __func__, + op_ret); + } + + ldpp_dout_fmt(this, 16, + "{} attr flags={} parts_count={}", + __func__, requested_attributes, *multipart_parts_count); + + s->formatter->open_object_section("ObjectParts"); + + bool truncated = false; + int next_marker; + + using namespace rgw::sal; + + int ret = + s->object->list_parts( + this, s->cct, + max_parts ? *max_parts : 1000, + marker ? *marker : 0, + &next_marker, &truncated, + [&](const Object::Part& part) -> int { + s->formatter->open_object_section("Part"); + s->formatter->dump_int("PartNumber", part.part_number); + s->formatter->dump_unsigned("Size", part.part_size); + if (part.cksum.type != rgw::cksum::Type::none) { + s->formatter->dump_string(part.cksum.element_name(), part.cksum.to_armor()); + } + s->formatter->close_section(); /* Part */ + return 0; + }, s->yield); + + if (ret < 0) { + ldpp_dout_fmt(this, 0, + "ERROR: {} list-parts failed for {}", + __func__, s->object->get_name()); + } + /* AWS docs disagree on the name of this element */ + s->formatter->dump_int("PartsCount", *multipart_parts_count); + s->formatter->dump_int("TotalPartsCount", *multipart_parts_count); + s->formatter->dump_bool("IsTruncated", truncated); + if (max_parts) { + s->formatter->dump_int("MaxParts", *max_parts); + } + if(truncated) { + s->formatter->dump_int("NextPartNumberMarker", next_marker); + } + if (marker) { + s->formatter->dump_int("PartNumberMarker", *marker); + } + 
s->formatter->close_section(); + } /* multipart_parts_count positive */ + } /* ObjectParts */ + + if (requested_attributes & as_flag(ReqAttributes::ObjectSize)) { + s->formatter->dump_int("ObjectSize", s->obj_size); + } + + if (requested_attributes & as_flag(ReqAttributes::StorageClass)) { + auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS); + if (iter != attrs.end()) { + s->formatter->dump_string("StorageClass", iter->second.to_str()); + } else { + s->formatter->dump_string("StorageClass", "STANDARD"); + } + } + s->formatter->close_section(); + } /* op_ret == 0 */ + + rgw_flush_formatter_and_reset(s, s->formatter); +} /* RGWGetObjAttrs_ObjStore_S3::send_response */ + void RGWGetLC_ObjStore_S3::execute(optional_yield y) { config.set_ctx(s->cct); @@ -4794,6 +4988,7 @@ RGWOp *RGWHandler_REST_Bucket_S3::get_obj_op(bool get_data) const RGWOp *RGWHandler_REST_Bucket_S3::op_get() { + /* XXX maybe we could replace this with an indexing operation */ if (s->info.args.sub_resource_exists("encryption")) return nullptr; @@ -4990,6 +5185,8 @@ RGWOp *RGWHandler_REST_Obj_S3::op_get() return new RGWGetObjLayout_ObjStore_S3; } else if (is_tagging_op()) { return new RGWGetObjTags_ObjStore_S3; + } else if (is_attributes_op()) { + return new RGWGetObjAttrs_ObjStore_S3; } else if (is_obj_retention_op()) { return new RGWGetObjRetention_ObjStore_S3; } else if (is_obj_legal_hold_op()) { @@ -6535,7 +6732,7 @@ rgw::auth::s3::LocalEngine::authenticate( /* Ignore signature for HTTP OPTIONS */ if (s->op_type == RGW_OP_OPTIONS_CORS) { auto apl = apl_factory->create_apl_local( - cct, s, user->get_info(), std::move(account), std::move(policies), + cct, s, std::move(user), std::move(account), std::move(policies), k.subuser, std::nullopt, access_key_id); return result_t::grant(std::move(apl), completer_factory(k.key)); } @@ -6556,7 +6753,7 @@ rgw::auth::s3::LocalEngine::authenticate( } auto apl = apl_factory->create_apl_local( - cct, s, user->get_info(), std::move(account), std::move(policies), + cct, 
s, std::move(user), std::move(account), std::move(policies), k.subuser, std::nullopt, access_key_id); return result_t::grant(std::move(apl), completer_factory(k.key)); } @@ -6765,7 +6962,7 @@ rgw::auth::s3::STSEngine::authenticate( string subuser; auto apl = local_apl_factory->create_apl_local( - cct, s, user->get_info(), std::move(account), std::move(policies), + cct, s, std::move(user), std::move(account), std::move(policies), subuser, token.perm_mask, std::string(_access_key_id)); return result_t::grant(std::move(apl), completer_factory(token.secret_access_key)); } diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h index 50160d79a42..e8fdc69751c 100644 --- a/src/rgw/rgw_rest_s3.h +++ b/src/rgw/rgw_rest_s3.h @@ -374,6 +374,18 @@ public: int get_params(optional_yield y) override; }; +class RGWGetObjAttrs_ObjStore_S3 : public RGWGetObjAttrs_ObjStore { +public: + RGWGetObjAttrs_ObjStore_S3() {} + ~RGWGetObjAttrs_ObjStore_S3() override {} + + int get_params(optional_yield y) override; + int get_decrypt_filter(std::unique_ptr<RGWGetObj_Filter>* filter, + RGWGetObj_Filter* cb, + bufferlist* manifest_bl) override; + void send_response() override; +}; + class RGWGetLC_ObjStore_S3 : public RGWGetLC_ObjStore { protected: RGWLifecycleConfiguration_S3 config; @@ -701,6 +713,9 @@ protected: bool is_acl_op() const { return s->info.args.exists("acl"); } + bool is_attributes_op() const { + return s->info.args.exists("attributes"); + } bool is_cors_op() const { return s->info.args.exists("cors"); } @@ -759,6 +774,9 @@ protected: bool is_acl_op() const { return s->info.args.exists("acl"); } + bool is_attributes_op() const { + return s->info.args.exists("attributes"); + } bool is_tagging_op() const { return s->info.args.exists("tagging"); } diff --git a/src/rgw/rgw_s3_filter.h b/src/rgw/rgw_s3_filter.h index 9bbc4ef0088..0273da9a364 100644 --- a/src/rgw/rgw_s3_filter.h +++ b/src/rgw/rgw_s3_filter.h @@ -9,6 +9,7 @@ class XMLObj; struct rgw_s3_key_filter { + bool 
operator==(const rgw_s3_key_filter& rhs) const = default; std::string prefix_rule; std::string suffix_rule; std::string regex_rule; diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h index e098c4decf7..97e25179fc9 100644 --- a/src/rgw/rgw_sal.h +++ b/src/rgw/rgw_sal.h @@ -15,6 +15,7 @@ #pragma once +#include <cstdint> #include <optional> #include <boost/intrusive_ptr.hpp> #include <boost/smart_ptr/intrusive_ref_counter.hpp> @@ -26,6 +27,7 @@ #include "rgw_notify_event_type.h" #include "rgw_req_context.h" #include "include/random.h" +#include "include/function2.hpp" // FIXME: following subclass dependencies #include "driver/rados/rgw_user.h" @@ -1004,20 +1006,27 @@ class Bucket { optional_yield y, const DoutPrefixProvider *dpp) = 0; /** Read the name of the pending bucket logging object name */ - virtual int get_logging_object_name(std::string& obj_name, - const std::string& prefix, - optional_yield y, + virtual int get_logging_object_name(std::string& obj_name, + const std::string& prefix, + optional_yield y, const DoutPrefixProvider *dpp, RGWObjVersionTracker* objv_tracker) = 0; /** Update the name of the pending bucket logging object name */ - virtual int set_logging_object_name(const std::string& obj_name, - const std::string& prefix, - optional_yield y, - const DoutPrefixProvider *dpp, + virtual int set_logging_object_name(const std::string& obj_name, + const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, bool new_obj, RGWObjVersionTracker* objv_tracker) = 0; + /** Remove the object holding the name of the pending bucket logging object */ + virtual int remove_logging_object_name(const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWObjVersionTracker* objv_tracker) = 0; /** Move the pending bucket logging object into the bucket */ virtual int commit_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp) = 0; + //** Remove the pending bucket logging object */ + virtual 
int remove_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp) = 0; /** Write a record to the pending bucket logging object */ virtual int write_logging_object(const std::string& obj_name, const std::string& record, optional_yield y, const DoutPrefixProvider *dpp, bool async_completion) = 0; @@ -1169,6 +1178,9 @@ class Object { std::string* version_id, std::string* tag, std::string* etag, void (*progress_cb)(off_t, void *), void* progress_data, const DoutPrefixProvider* dpp, optional_yield y) = 0; + + /** return logging subsystem */ + virtual unsigned get_subsys() { return ceph_subsys_rgw; }; /** Get the ACL for this object */ virtual RGWAccessControlPolicy& get_acl(void) = 0; /** Set the ACL for this object */ @@ -1249,6 +1261,28 @@ class Object { /** Dump driver-specific object layout info in JSON */ virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) = 0; + /* A transfer data type describing metadata specific to one part of a + * completed multipart upload object, following the GetObjectAttributes + * response syntax for Object::Parts here: + * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObjectAttributes.html */ + class Part + { + public: + int part_number; + uint32_t part_size; + rgw::cksum::Cksum cksum; + }; /* Part */ + + /* callback function/object used by list_parts */ + using list_parts_each_t = + const fu2::unique_function<int(const Part&) const>; + + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) = 0; + /** Get the cached attributes for this object */ virtual Attrs& get_attrs(void) = 0; /** Get the (const) cached attributes for this object */ @@ -1447,7 +1481,7 @@ public: virtual int init(const DoutPrefixProvider* 
dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) = 0; /** List all the parts of this upload, filling the parts cache */ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, - int num_parts, int marker, + int max_parts, int marker, int* next_marker, bool* truncated, optional_yield y, bool assume_unsorted = false) = 0; /** Abort this upload */ @@ -1751,8 +1785,6 @@ class Zone { virtual bool is_writeable() = 0; /** Get the URL for the endpoint for redirecting to this zone */ virtual bool get_redirect_endpoint(std::string* endpoint) = 0; - /** Check to see if the given API is supported in this zone */ - virtual bool has_zonegroup_api(const std::string& api) const = 0; /** Get the current period ID for this zone */ virtual const std::string& get_current_period_id() = 0; /** Get thes system access key for this zone */ diff --git a/src/rgw/rgw_sal_dbstore.cc b/src/rgw/rgw_sal_dbstore.cc index 0e4f95846d1..02fd7a49cda 100644 --- a/src/rgw/rgw_sal_dbstore.cc +++ b/src/rgw/rgw_sal_dbstore.cc @@ -458,14 +458,6 @@ namespace rgw::sal { return false; } - bool DBZone::has_zonegroup_api(const std::string& api) const - { - if (api == "default") - return true; - - return false; - } - const std::string& DBZone::get_current_period_id() { return current_period->get_id(); @@ -496,6 +488,14 @@ namespace rgw::sal { return std::make_unique<DBLuaManager>(this); } + int DBObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) + { + return -EOPNOTSUPP; + } + int DBObject::load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh) { RGWObjState* astate; diff --git a/src/rgw/rgw_sal_dbstore.h b/src/rgw/rgw_sal_dbstore.h index b54249df031..4df10d1dce1 100644 --- a/src/rgw/rgw_sal_dbstore.h +++ b/src/rgw/rgw_sal_dbstore.h @@ -303,7 +303,6 @@ protected: virtual const std::string& 
get_name() const override; virtual bool is_writeable() override; virtual bool get_redirect_endpoint(std::string* endpoint) override; - virtual bool has_zonegroup_api(const std::string& api) const override; virtual const std::string& get_current_period_id() override; virtual const RGWAccessKey& get_system_key() override; virtual const std::string& get_realm_name() override; @@ -529,6 +528,7 @@ protected: DBObject(DBObject& _o) = default; + virtual unsigned get_subsys() { return ceph_subsys_rgw_dbstore; }; virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y, uint32_t flags, @@ -554,6 +554,13 @@ protected: virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; } virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y, uint32_t flags) override; + + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) override; + virtual int load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh = true) override; virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override; virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override; diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc index 733bfa39ee2..15da580988e 100644 --- a/src/rgw/rgw_sal_filter.cc +++ b/src/rgw/rgw_sal_filter.cc @@ -1046,6 +1046,17 @@ RGWAccessControlPolicy& FilterObject::get_acl() return next->get_acl(); } +int FilterObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) +{ + 
return next->list_parts(dpp, cct, max_parts, marker, next_marker, + truncated, + sal::Object::list_parts_each_t(each_func), + y); +} + int FilterObject::load_obj_state(const DoutPrefixProvider *dpp, optional_yield y, bool follow_olh) { return next->load_obj_state(dpp, y, follow_olh); diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h index 43a440e8b10..b6b6ed42b8f 100644 --- a/src/rgw/rgw_sal_filter.h +++ b/src/rgw/rgw_sal_filter.h @@ -108,9 +108,6 @@ public: virtual bool get_redirect_endpoint(std::string* endpoint) override { return next->get_redirect_endpoint(endpoint); } - virtual bool has_zonegroup_api(const std::string& api) const override { - return next->has_zonegroup_api(api); - } virtual const std::string& get_current_period_id() override { return next->get_current_period_id(); } @@ -669,24 +666,33 @@ public: optional_yield y, const DoutPrefixProvider *dpp) override { return next->remove_topics(objv_tracker, y, dpp); } - int get_logging_object_name(std::string& obj_name, - const std::string& prefix, - optional_yield y, + int get_logging_object_name(std::string& obj_name, + const std::string& prefix, + optional_yield y, const DoutPrefixProvider *dpp, RGWObjVersionTracker* objv_tracker) override { return next->get_logging_object_name(obj_name, prefix, y, dpp, objv_tracker); } - int set_logging_object_name(const std::string& obj_name, - const std::string& prefix, - optional_yield y, - const DoutPrefixProvider *dpp, + int set_logging_object_name(const std::string& obj_name, + const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, bool new_obj, RGWObjVersionTracker* objv_track) override { - return next->set_logging_object_name(obj_name, prefix, y, dpp, new_obj, objv_track); + return next->set_logging_object_name(obj_name, prefix, y, dpp, new_obj, objv_track); + } + int remove_logging_object_name(const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWObjVersionTracker* objv_tracker) override { 
+ return next->remove_logging_object_name(prefix, y, dpp, objv_tracker); } int commit_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp)override { return next->commit_logging_object(obj_name, y, dpp); } + int remove_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp) override { + return next->remove_logging_object(obj_name, y, dpp); + } int write_logging_object(const std::string& obj_name, const std::string& record, optional_yield y, const DoutPrefixProvider *dpp, bool async_completion) override { return next->write_logging_object(obj_name, record, y, dpp, async_completion); } @@ -781,6 +787,12 @@ public: virtual bool empty() const override { return next->empty(); } virtual const std::string &get_name() const override { return next->get_name(); } + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) override; + virtual int load_obj_state(const DoutPrefixProvider *dpp, optional_yield y, bool follow_olh = true) override; virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, diff --git a/src/rgw/rgw_sal_store.h b/src/rgw/rgw_sal_store.h index 5cb98d23158..99b90564997 100644 --- a/src/rgw/rgw_sal_store.h +++ b/src/rgw/rgw_sal_store.h @@ -253,18 +253,23 @@ class StoreBucket : public Bucket { optional_yield y, const DoutPrefixProvider *dpp) override {return 0;} int remove_topics(RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) override {return 0;} - int get_logging_object_name(std::string& obj_name, - const std::string& prefix, - optional_yield y, + int get_logging_object_name(std::string& obj_name, + const std::string& prefix, + optional_yield y, const DoutPrefixProvider *dpp, 
RGWObjVersionTracker* objv_tracker) override { return 0; } - int set_logging_object_name(const std::string& obj_name, - const std::string& prefix, - optional_yield y, - const DoutPrefixProvider *dpp, + int set_logging_object_name(const std::string& obj_name, + const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, bool new_obj, RGWObjVersionTracker* objv_tracker) override { return 0; } + int remove_logging_object_name(const std::string& prefix, + optional_yield y, + const DoutPrefixProvider *dpp, + RGWObjVersionTracker* objv_tracker) override { return 0; } int commit_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp) override { return 0; } + int remove_logging_object(const std::string& obj_name, optional_yield y, const DoutPrefixProvider *dpp) override { return 0; } int write_logging_object(const std::string& obj_name, const std::string& record, optional_yield y, const DoutPrefixProvider *dpp, bool async_completion) override { return 0; } diff --git a/src/rgw/rgw_swift_auth.cc b/src/rgw/rgw_swift_auth.cc index 032b3734bf9..937f74601b3 100644 --- a/src/rgw/rgw_swift_auth.cc +++ b/src/rgw/rgw_swift_auth.cc @@ -522,7 +522,7 @@ ExternalTokenEngine::authenticate(const DoutPrefixProvider* dpp, } auto apl = apl_factory->create_apl_local( - cct, s, user->get_info(), std::move(account), + cct, s, std::move(user), std::move(account), std::move(policies), extract_swift_subuser(swift_user), std::nullopt, LocalApplier::NO_ACCESS_KEY); return result_t::grant(std::move(apl)); @@ -685,7 +685,7 @@ SignedTokenEngine::authenticate(const DoutPrefixProvider* dpp, } auto apl = apl_factory->create_apl_local( - cct, s, user->get_info(), std::move(account), + cct, s, std::move(user), std::move(account), std::move(policies), extract_swift_subuser(swift_user), std::nullopt, LocalApplier::NO_ACCESS_KEY); return result_t::grant(std::move(apl)); diff --git a/src/rgw/rgw_swift_auth.h b/src/rgw/rgw_swift_auth.h index 
9049c54f5ca..c27a24a2619 100644 --- a/src/rgw/rgw_swift_auth.h +++ b/src/rgw/rgw_swift_auth.h @@ -23,8 +23,8 @@ namespace swift { class TempURLApplier : public rgw::auth::LocalApplier { public: TempURLApplier(CephContext* const cct, - const RGWUserInfo& user_info) - : LocalApplier(cct, user_info, std::nullopt, {}, LocalApplier::NO_SUBUSER, + std::unique_ptr<rgw::sal::User> user) + : LocalApplier(cct, std::move(user), std::nullopt, {}, LocalApplier::NO_SUBUSER, std::nullopt, LocalApplier::NO_ACCESS_KEY) {} @@ -155,8 +155,8 @@ public: class SwiftAnonymousApplier : public rgw::auth::LocalApplier { public: SwiftAnonymousApplier(CephContext* const cct, - const RGWUserInfo& user_info) - : LocalApplier(cct, user_info, std::nullopt, {}, LocalApplier::NO_SUBUSER, + std::unique_ptr<rgw::sal::User> user) + : LocalApplier(cct, std::move(user), std::nullopt, {}, LocalApplier::NO_SUBUSER, std::nullopt, LocalApplier::NO_ACCESS_KEY) { } bool is_admin_of(const rgw_owner& o) const {return false;} @@ -238,7 +238,7 @@ class DefaultStrategy : public rgw::auth::Strategy, aplptr_t create_apl_local(CephContext* const cct, const req_state* const s, - const RGWUserInfo& user_info, + std::unique_ptr<rgw::sal::User> user, std::optional<RGWAccountInfo> account, std::vector<IAM::Policy> policies, const std::string& subuser, @@ -247,7 +247,7 @@ class DefaultStrategy : public rgw::auth::Strategy, auto apl = \ rgw::auth::add_3rdparty(driver, rgw_user(s->account_name), rgw::auth::add_sysreq(cct, driver, s, - LocalApplier(cct, user_info, std::move(account), std::move(policies), + LocalApplier(cct, std::move(user), std::move(account), std::move(policies), subuser, perm_mask, access_key_id))); /* TODO(rzarzynski): replace with static_ptr. */ return aplptr_t(new decltype(apl)(std::move(apl))); @@ -259,7 +259,9 @@ class DefaultStrategy : public rgw::auth::Strategy, /* TempURL doesn't need any user account override. 
It's a Swift-specific * mechanism that requires account name internally, so there is no * business with delegating the responsibility outside. */ - return aplptr_t(new rgw::auth::swift::TempURLApplier(cct, user_info)); + std::unique_ptr<rgw::sal::User> user = s->user->clone(); + user->get_info() = user_info; + return aplptr_t(new rgw::auth::swift::TempURLApplier(cct, std::move(user))); } public: diff --git a/src/rgw/services/svc_zone.cc b/src/rgw/services/svc_zone.cc index 70cf40eb6cb..97d81550058 100644 --- a/src/rgw/services/svc_zone.cc +++ b/src/rgw/services/svc_zone.cc @@ -657,18 +657,6 @@ const string& RGWSI_Zone::get_current_period_id() const return current_period->get_id(); } -bool RGWSI_Zone::has_zonegroup_api(const std::string& api) const -{ - if (!current_period->get_id().empty()) { - const auto& zonegroups_by_api = current_period->get_map().zonegroups_by_api; - if (zonegroups_by_api.find(api) != zonegroups_by_api.end()) - return true; - } else if (zonegroup->api_name == api) { - return true; - } - return false; -} - bool RGWSI_Zone::zone_is_writeable() { return writeable_zone && !get_zone().is_read_only(); @@ -743,8 +731,7 @@ bool RGWSI_Zone::is_meta_master() const bool RGWSI_Zone::need_to_log_metadata() const { - return is_meta_master() && - (zonegroup->zones.size() > 1 || current_period->is_multi_zonegroups_with_zones()); + return is_meta_master() && is_syncing_bucket_meta(); } bool RGWSI_Zone::can_reshard() const @@ -761,33 +748,16 @@ bool RGWSI_Zone::can_reshard() const /** * Check to see if the bucket metadata could be synced - * bucket: the bucket to check * Returns false is the bucket is not synced */ -bool RGWSI_Zone::is_syncing_bucket_meta(const rgw_bucket& bucket) +bool RGWSI_Zone::is_syncing_bucket_meta() const { - /* no current period */ if (current_period->get_id().empty()) { return false; } - /* zonegroup is not master zonegroup */ - if (!zonegroup->is_master_zonegroup()) { - return false; - } - - /* single zonegroup and a single zone */ - 
if (current_period->is_single_zonegroup() && zonegroup->zones.size() == 1) { - return false; - } - - /* zone is not master */ - if (zonegroup->master_zone != zone_public_config->id) { - return false; - } - - return true; + return zonegroup->zones.size() > 1 || current_period->is_multi_zonegroups_with_zones(); } diff --git a/src/rgw/services/svc_zone.h b/src/rgw/services/svc_zone.h index c4a3a28f0d7..719546eb8db 100644 --- a/src/rgw/services/svc_zone.h +++ b/src/rgw/services/svc_zone.h @@ -96,7 +96,6 @@ public: uint32_t get_zone_short_id() const; const std::string& get_current_period_id() const; - bool has_zonegroup_api(const std::string& api) const; bool zone_is_writeable(); bool zone_syncs_from(const RGWZone& target_zone, const RGWZone& source_zone) const; @@ -146,7 +145,7 @@ public: bool need_to_log_data() const; bool need_to_log_metadata() const; bool can_reshard() const; - bool is_syncing_bucket_meta(const rgw_bucket& bucket); + bool is_syncing_bucket_meta() const; int list_zonegroups(const DoutPrefixProvider *dpp, std::list<std::string>& zonegroups); int list_regions(const DoutPrefixProvider *dpp, std::list<std::string>& regions); diff --git a/src/script/ceph-backport.sh b/src/script/ceph-backport.sh index a56509e3d3a..c216ed32d9b 100755 --- a/src/script/ceph-backport.sh +++ b/src/script/ceph-backport.sh @@ -779,7 +779,7 @@ function maybe_deduce_remote { else assert_fail "bad remote_type ->$remote_type<- in maybe_deduce_remote" fi - remote=$(git remote -v | grep --extended-regexp --ignore-case '(://|@)github.com(/|:|:/)'${url_component}'/ceph(\s|\.|\/)' | head -n1 | cut -f 1) + remote=$(git remote -v | grep --extended-regexp --ignore-case '(://|@)github.com(/|:|:/)'${url_component}'/ceph(\s|\.|\/|-)' | head -n1 | cut -f 1) echo "$remote" } diff --git a/src/script/run-make.sh b/src/script/run-make.sh index 52d43d3a171..23724028fe6 100755 --- a/src/script/run-make.sh +++ b/src/script/run-make.sh @@ -29,6 +29,7 @@ function clean_up_after_myself() { function 
detect_ceph_dev_pkgs() { local boost_root=/opt/ceph + local cmake_opts="" if test -f $boost_root/include/boost/config.hpp; then cmake_opts+=" -DWITH_SYSTEM_BOOST=ON -DBOOST_ROOT=$boost_root" else diff --git a/src/test/ObjectMap/KeyValueDBMemory.cc b/src/test/ObjectMap/KeyValueDBMemory.cc index 234e963397e..cfe25930d6a 100644 --- a/src/test/ObjectMap/KeyValueDBMemory.cc +++ b/src/test/ObjectMap/KeyValueDBMemory.cc @@ -132,12 +132,26 @@ public: return ""; } + string_view key_as_sv() override { + if (valid()) + return (*it).first.second; + else + return ""; + } + pair<string,string> raw_key() override { if (valid()) return (*it).first; else return make_pair("", ""); } + + pair<string_view,string_view> raw_key_as_sv() override { + if (valid()) + return (*it).first; + else + return make_pair("", ""); + } bool raw_key_is_prefixed(const string &prefix) override { return prefix == (*it).first.first; @@ -150,6 +164,13 @@ public: return bufferlist(); } + std::string_view value_as_sv() override { + if (valid()) + return std::string_view{it->second.c_str(), it->second.length()}; + else + return std::string_view(); + } + int status() override { return 0; } diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t index cb45a9883c3..c1675d11a80 100644 --- a/src/test/cli/radosgw-admin/help.t +++ b/src/test/cli/radosgw-admin/help.t @@ -43,7 +43,8 @@ bucket sync disable disable bucket sync bucket sync enable enable bucket sync bucket radoslist list rados objects backing bucket's objects - bucket logging flush flush pending log records object of source bucket to the log bucket to bucket + bucket logging flush flush pending log records object of source bucket to the log bucket + bucket logging info get info on bucket logging configuration on source bucket or list of sources in log bucket bi get retrieve bucket index object entries bi put store bucket index object entries bi list list raw bucket index entries @@ -226,6 +227,7 @@ --secret/--secret-key=<key> 
specify secret key --gen-access-key generate random access key (for S3) --gen-secret generate random secret key + --generate-key create user with or without credentials --key-type=<type> key type, options are: swift, s3 --key-active=<bool> activate or deactivate a key --temp-url-key[-2]=<key> temp url key diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t index 984175a97b9..5f304258358 100644 --- a/src/test/cli/rbd/help.t +++ b/src/test/cli/rbd/help.t @@ -916,7 +916,7 @@ [--group-namespace <group-namespace>] [--group <group>] [--image-pool <image-pool>] [--image-namespace <image-namespace>] - [--image <image>] [--pool <pool>] + [--image <image>] <group-spec> <image-spec> Add an image to a group. @@ -934,7 +934,6 @@ --image-pool arg image pool name --image-namespace arg image namespace name --image arg image name - -p [ --pool ] arg pool name unless overridden rbd help group image list usage: rbd group image list [--format <format>] [--pretty-format] @@ -960,8 +959,7 @@ [--group-namespace <group-namespace>] [--group <group>] [--image-pool <image-pool>] [--image-namespace <image-namespace>] - [--image <image>] [--pool <pool>] - [--image-id <image-id>] + [--image <image>] [--image-id <image-id>] <group-spec> <image-spec> Remove an image from a group. 
@@ -979,7 +977,6 @@ --image-pool arg image pool name --image-namespace arg image namespace name --image arg image name - -p [ --pool ] arg pool name unless overridden --image-id arg image id rbd help group info diff --git a/src/test/crimson/seastore/test_block.h b/src/test/crimson/seastore/test_block.h index e1fe8e06f8a..546f357dea0 100644 --- a/src/test/crimson/seastore/test_block.h +++ b/src/test/crimson/seastore/test_block.h @@ -39,8 +39,8 @@ struct test_block_delta_t { inline std::ostream &operator<<( std::ostream &lhs, const test_extent_desc_t &rhs) { - return lhs << "test_extent_desc_t(len=" << rhs.len - << ", checksum=" << rhs.checksum << ")"; + return lhs << "test_extent_desc_t(len=0x" << std::hex << rhs.len + << ", checksum=0x" << rhs.checksum << std::dec << ")"; } struct TestBlock : crimson::os::seastore::LogicalCachedExtent { diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index 9988df3a124..7874411e0ff 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -112,14 +112,22 @@ struct btree_test_base : seastar::future<> submit_transaction(TransactionRef t) { auto record = cache->prepare_record(*t, JOURNAL_SEQ_NULL, JOURNAL_SEQ_NULL); - return journal->submit_record(std::move(record), t->get_handle()).safe_then( - [this, t=std::move(t)](auto submit_result) mutable { - cache->complete_commit( - *t, + return seastar::do_with( + std::move(t), [this, record=std::move(record)](auto& _t) mutable { + auto& t = *_t; + return journal->submit_record( + std::move(record), + t.get_handle(), + t.get_src(), + [this, &t](auto submit_result) { + cache->complete_commit( + t, submit_result.record_block_base, submit_result.write_result.start_seq); - complete_commit(*t); - }).handle_error(crimson::ct_error::assert_all{}); + complete_commit(t); + } + ).handle_error(crimson::ct_error::assert_all{}); + }); } virtual LBAManager::mkfs_ret 
test_structure_setup(Transaction &t) = 0; @@ -149,7 +157,10 @@ struct btree_test_base : }).safe_then([this] { return seastar::do_with( cache->create_transaction( - Transaction::src_t::MUTATE, "test_set_up_fut", false), + Transaction::src_t::MUTATE, + "test_set_up_fut", + CACHE_HINT_TOUCH, + false), [this](auto &ref_t) { return with_trans_intr(*ref_t, [&](auto &t) { cache->init(); @@ -228,7 +239,10 @@ struct lba_btree_test : btree_test_base { template <typename F> auto lba_btree_update(F &&f) { auto tref = cache->create_transaction( - Transaction::src_t::MUTATE, "test_btree_update", false); + Transaction::src_t::MUTATE, + "test_btree_update", + CACHE_HINT_TOUCH, + false); auto &t = *tref; with_trans_intr( t, @@ -273,7 +287,10 @@ struct lba_btree_test : btree_test_base { template <typename F> auto lba_btree_read(F &&f) { auto t = cache->create_transaction( - Transaction::src_t::READ, "test_btree_read", false); + Transaction::src_t::READ, + "test_btree_read", + CACHE_HINT_TOUCH, + false); return with_trans_intr( *t, [this, f=std::forward<F>(f)](auto &t) mutable { @@ -421,7 +438,10 @@ struct btree_lba_manager_test : btree_test_base { auto create_transaction(bool create_fake_extent=true) { auto t = test_transaction_t{ cache->create_transaction( - Transaction::src_t::MUTATE, "test_mutate_lba", false), + Transaction::src_t::MUTATE, + "test_mutate_lba", + CACHE_HINT_TOUCH, + false), test_lba_mappings }; if (create_fake_extent) { @@ -437,7 +457,10 @@ struct btree_lba_manager_test : btree_test_base { auto create_weak_transaction() { auto t = test_transaction_t{ cache->create_transaction( - Transaction::src_t::READ, "test_read_weak", true), + Transaction::src_t::READ, + "test_read_weak", + CACHE_HINT_TOUCH, + true), test_lba_mappings }; return t; diff --git a/src/test/crimson/seastore/test_cbjournal.cc b/src/test/crimson/seastore/test_cbjournal.cc index d00a0f42729..47a08d68cbb 100644 --- a/src/test/crimson/seastore/test_cbjournal.cc +++ 
b/src/test/crimson/seastore/test_cbjournal.cc @@ -181,15 +181,20 @@ struct cbjournal_test_t : public seastar_test_suite_t, JournalTrimmer auto submit_record(record_t&& record) { entries.push_back(record); + entry_validator_t& back = entries.back(); OrderingHandle handle = get_dummy_ordering_handle(); - auto [addr, w_result] = cbj->submit_record( - std::move(record), - handle).unsafe_get(); - entries.back().seq = w_result.start_seq; - entries.back().entries = 1; - entries.back().magic = cbj->get_cjs().get_cbj_header().magic; - logger().debug("submit entry to addr {}", entries.back().seq); - return convert_paddr_to_abs_addr(entries.back().seq.offset); + cbj->submit_record( + std::move(record), + handle, + transaction_type_t::MUTATE, + [this, &back](auto locator) { + back.seq = locator.write_result.start_seq; + back.entries = 1; + back.magic = cbj->get_cjs().get_cbj_header().magic; + logger().debug("submit entry to addr {}", back.seq); + } + ).unsafe_get(); + return convert_paddr_to_abs_addr(back.seq.offset); } seastar::future<> tear_down_fut() final { diff --git a/src/test/crimson/seastore/test_seastore_cache.cc b/src/test/crimson/seastore/test_seastore_cache.cc index 6e24f436b98..fa774886139 100644 --- a/src/test/crimson/seastore/test_seastore_cache.cc +++ b/src/test/crimson/seastore/test_seastore_cache.cc @@ -87,7 +87,10 @@ struct cache_test_t : public seastar_test_suite_t { auto get_transaction() { return cache->create_transaction( - Transaction::src_t::MUTATE, "test_cache", false); + Transaction::src_t::MUTATE, + "test_cache", + CACHE_HINT_TOUCH, + false); } template <typename T, typename... 
Args> diff --git a/src/test/crimson/seastore/test_seastore_journal.cc b/src/test/crimson/seastore/test_seastore_journal.cc index 2eb791b1d46..04a99319b11 100644 --- a/src/test/crimson/seastore/test_seastore_journal.cc +++ b/src/test/crimson/seastore/test_seastore_journal.cc @@ -233,12 +233,17 @@ struct journal_test_t : seastar_test_suite_t, SegmentProvider, JournalTrimmer { auto submit_record(T&&... _record) { auto record{std::forward<T>(_record)...}; records.push_back(record); + record_validator_t& back = records.back(); OrderingHandle handle = get_dummy_ordering_handle(); - auto [addr, _] = journal->submit_record( + journal->submit_record( std::move(record), - handle).unsafe_get(); - records.back().record_final_offset = addr; - return addr; + handle, + transaction_type_t::MUTATE, + [&back](auto locator) { + back.record_final_offset = locator.record_block_base; + } + ).unsafe_get(); + return back.record_final_offset; } extent_t generate_extent(size_t blocks) { diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 7e058c80ed6..e0fc5821d08 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -119,6 +119,11 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener { events_to_dispatch.emplace_back(event.intrusive_from_this()); } + template <class EventT> + void schedule_event_immediate(const EventT& event) { + events_to_dispatch.emplace_front(event.intrusive_from_this()); + } + // BackfillListener { void request_replica_scan( const pg_shard_t& target, @@ -188,12 +193,11 @@ public: struct PGFacade; void cancel() { - events_to_dispatch.clear(); - schedule_event(crimson::osd::BackfillState::CancelBackfill{}); + schedule_event_immediate(crimson::osd::BackfillState::CancelBackfill{}); } void resume() { - schedule_event(crimson::osd::BackfillState::Triggered{}); + schedule_event_immediate(crimson::osd::BackfillState::Triggered{}); } }; @@ -274,6 +278,9 @@ struct 
BackfillFixture::PGFacade : public crimson::osd::BackfillState::PGFacade return backfill_source.projected_log; } + std::ostream &print(std::ostream &out) const override { + return out << "FakePGFacade"; + } }; BackfillFixture::BackfillFixture( @@ -452,7 +459,69 @@ TEST(backfill, two_empty_replicas) EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); } -TEST(backfill, cancel_resume) +TEST(backfill, cancel_resume_middle_of_primaryscan) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.cancel(); + cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.resume(); + cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_replicascan1) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { 
"1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.cancel(); + cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); + cluster_fixture.resume(); + cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_replicascan2) { const auto reference_store = FakeStore{ { { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, @@ -469,12 +538,43 @@ TEST(backfill, cancel_resume) EXPECT_CALL(cluster_fixture, backfilled); cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); cluster_fixture.cancel(); cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); cluster_fixture.resume(); cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + 
cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_push1) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.cancel(); + cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); + cluster_fixture.resume(); + cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); @@ -483,7 +583,7 @@ TEST(backfill, cancel_resume) EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); } -TEST(backfill, cancel_resume_middle_of_scan) +TEST(backfill, cancel_resume_middle_of_push2) { const auto reference_store = FakeStore{ { { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, @@ -501,14 +601,46 @@ TEST(backfill, cancel_resume_middle_of_scan) EXPECT_CALL(cluster_fixture, backfilled); cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + 
cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); cluster_fixture.cancel(); cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); cluster_fixture.resume(); cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_push3) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.cancel(); + cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.resume(); + cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::RequestDone>(); cluster_fixture.next_till_done(); 
EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); diff --git a/src/test/librados/aio.cc b/src/test/librados/aio.cc index 68587fe87d1..7fb90bdd38e 100644 --- a/src/test/librados/aio.cc +++ b/src/test/librados/aio.cc @@ -1722,3 +1722,59 @@ TEST(LibRadosAioEC, MultiWrite) { rados_aio_release(my_completion2); rados_aio_release(my_completion3); } + +TEST(LibRadosAio, CancelBeforeSubmit) { + AioTestData test_data; + ASSERT_EQ("", test_data.init()); + + rados_completion_t completion; + ASSERT_EQ(0, rados_aio_create_completion2(nullptr, nullptr, &completion)); + + ASSERT_EQ(0, rados_aio_cancel(test_data.m_ioctx, completion)); + rados_aio_release(completion); +} + +TEST(LibRadosAio, CancelBeforeComplete) { + AioTestData test_data; + ASSERT_EQ("", test_data.init()); + + // cancellation tests are racy, so retry if completion beats the cancellation + int ret = 0; + int tries = 10; + do { + rados_completion_t completion; + ASSERT_EQ(0, rados_aio_create_completion2(nullptr, nullptr, &completion)); + char buf[128]; + ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent", + completion, buf, sizeof(buf), 0)); + + ASSERT_EQ(0, rados_aio_cancel(test_data.m_ioctx, completion)); + { + TestAlarm alarm; + ASSERT_EQ(0, rados_aio_wait_for_complete(completion)); + } + ret = rados_aio_get_return_value(completion); + rados_aio_release(completion); + } while (ret == -ENOENT && --tries); + + ASSERT_EQ(-ECANCELED, ret); +} + +TEST(LibRadosAio, CancelAfterComplete) { + AioTestData test_data; + rados_completion_t completion; + ASSERT_EQ("", test_data.init()); + + ASSERT_EQ(0, rados_aio_create_completion2(nullptr, nullptr, &completion)); + char buf[128]; + ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent", + completion, buf, sizeof(buf), 0)); + + { + TestAlarm alarm; + ASSERT_EQ(0, rados_aio_wait_for_complete(completion)); + } + ASSERT_EQ(0, rados_aio_cancel(test_data.m_ioctx, completion)); + ASSERT_EQ(-ENOENT, rados_aio_get_return_value(completion)); + 
rados_aio_release(completion); +} diff --git a/src/test/librados/aio_cxx.cc b/src/test/librados/aio_cxx.cc index a70af050d70..5e35869b5c2 100644 --- a/src/test/librados/aio_cxx.cc +++ b/src/test/librados/aio_cxx.cc @@ -2467,3 +2467,92 @@ TEST(LibRadosAio, MultiReads) { ASSERT_EQ(0, memcmp(buf, bl.c_str(), sizeof(buf))); } } + +// cancellation test fixture for global setup/teardown +// parameterized to test both IoCtx::aio_cancel() and AioCompletion::cancel() +class Cancel : public ::testing::TestWithParam<bool> { + static constexpr auto pool_prefix = "ceph_test_rados_api_pp"; + static Rados rados; + static std::string pool_name; + protected: + static IoCtx ioctx; + public: + static void SetUpTestCase() { + pool_name = get_temp_pool_name(pool_prefix); + ASSERT_EQ("", create_one_pool_pp(pool_name, rados)); + ASSERT_EQ(0, rados.ioctx_create(pool_name.c_str(), ioctx)); + } + static void TearDownTestCase() { + destroy_one_pool_pp(pool_name, rados); + } +}; +Rados Cancel::rados; +std::string Cancel::pool_name; +IoCtx Cancel::ioctx; + +TEST_P(Cancel, BeforeSubmit) +{ + const bool use_completion = GetParam(); + + auto c = std::unique_ptr<AioCompletion>{Rados::aio_create_completion()}; + if (use_completion) { + ASSERT_EQ(0, c->cancel()); + } else { + ASSERT_EQ(0, ioctx.aio_cancel(c.get())); + } +} + +TEST_P(Cancel, BeforeComplete) +{ + const bool use_completion = GetParam(); + + // cancellation tests are racy, so retry if completion beats the cancellation + int ret = 0; + int tries = 10; + do { + auto c = std::unique_ptr<AioCompletion>{Rados::aio_create_completion()}; + ObjectReadOperation op; + op.assert_exists(); + ioctx.aio_operate("nonexistent", c.get(), &op, nullptr); + + if (use_completion) { + EXPECT_EQ(0, c->cancel()); + } else { + EXPECT_EQ(0, ioctx.aio_cancel(c.get())); + } + { + TestAlarm alarm; + ASSERT_EQ(0, c->wait_for_complete()); + } + ret = c->get_return_value(); + } while (ret == -ENOENT && --tries); + + EXPECT_EQ(-ECANCELED, ret); +} + +TEST_P(Cancel, 
AfterComplete) +{ + const bool use_completion = GetParam(); + + auto c = std::unique_ptr<AioCompletion>{Rados::aio_create_completion()}; + ObjectReadOperation op; + op.assert_exists(); + ioctx.aio_operate("nonexistent", c.get(), &op, nullptr); + { + TestAlarm alarm; + ASSERT_EQ(0, c->wait_for_complete()); + } + if (use_completion) { + EXPECT_EQ(0, c->cancel()); + } else { + EXPECT_EQ(0, ioctx.aio_cancel(c.get())); + } + EXPECT_EQ(-ENOENT, c->get_return_value()); +} + +std::string cancel_test_name(const testing::TestParamInfo<Cancel::ParamType>& info) +{ + return info.param ? "cancel" : "aio_cancel"; +} + +INSTANTIATE_TEST_SUITE_P(LibRadosAio, Cancel, testing::Bool(), cancel_test_name); diff --git a/src/test/librados/asio.cc b/src/test/librados/asio.cc index 01ebb957150..500f36508a7 100644 --- a/src/test/librados/asio.cc +++ b/src/test/librados/asio.cc @@ -21,10 +21,14 @@ #include <boost/range/begin.hpp> #include <boost/range/end.hpp> +#include <boost/asio/bind_cancellation_slot.hpp> +#include <boost/asio/cancellation_signal.hpp> #include <boost/asio/io_context.hpp> #include <boost/asio/spawn.hpp> #include <boost/asio/use_future.hpp> +#include <optional> + #define dout_subsys ceph_subsys_rados #define dout_context g_ceph_context @@ -78,6 +82,15 @@ void rethrow(std::exception_ptr eptr) { if (eptr) std::rethrow_exception(eptr); } +auto capture(std::optional<error_code>& out) { + return [&out] (error_code ec, ...) { out = ec; }; +} + +auto capture(boost::asio::cancellation_signal& signal, + std::optional<error_code>& out) { + return boost::asio::bind_cancellation_slot(signal.slot(), capture(out)); +} + TEST_F(AsioRados, AsyncReadCallback) { boost::asio::io_context service; @@ -385,6 +398,130 @@ TEST_F(AsioRados, AsyncWriteOperationYield) service.run(); } +// FIXME: this crashes on windows with: +// Thread 1 received signal SIGILL, Illegal instruction. 
+#ifndef _WIN32 + +TEST_F(AsioRados, AsyncReadOperationCancelTerminal) +{ + // cancellation tests are racy, so retry if completion beats the cancellation + boost::system::error_code ec; + int tries = 10; + do { + boost::asio::io_context service; + boost::asio::cancellation_signal signal; + std::optional<error_code> result; + + librados::ObjectReadOperation op; + op.assert_exists(); + librados::async_operate(service, io, "noexist", &op, 0, nullptr, + capture(signal, result)); + + service.poll(); + EXPECT_FALSE(service.stopped()); + EXPECT_FALSE(result); + + signal.emit(boost::asio::cancellation_type::terminal); + + service.run(); + ASSERT_TRUE(result); + ec = *result; + + signal.emit(boost::asio::cancellation_type::all); // noop + } while (ec == std::errc::no_such_file_or_directory && --tries); + + EXPECT_EQ(ec, boost::asio::error::operation_aborted); +} + +TEST_F(AsioRados, AsyncReadOperationCancelTotal) +{ + // cancellation tests are racy, so retry if completion beats the cancellation + boost::system::error_code ec; + int tries = 10; + do { + boost::asio::io_context service; + boost::asio::cancellation_signal signal; + std::optional<error_code> result; + + librados::ObjectReadOperation op; + op.assert_exists(); + librados::async_operate(service, io, "noexist", &op, 0, nullptr, + capture(signal, result)); + + service.poll(); + EXPECT_FALSE(service.stopped()); + EXPECT_FALSE(result); + + signal.emit(boost::asio::cancellation_type::total); + + service.run(); + ASSERT_TRUE(result); + ec = *result; + + signal.emit(boost::asio::cancellation_type::all); // noop + } while (ec == std::errc::no_such_file_or_directory && --tries); + + EXPECT_EQ(ec, boost::asio::error::operation_aborted); +} + +TEST_F(AsioRados, AsyncWriteOperationCancelTerminal) +{ + // cancellation tests are racy, so retry if completion beats the cancellation + boost::system::error_code ec; + int tries = 10; + do { + boost::asio::io_context service; + boost::asio::cancellation_signal signal; + 
std::optional<error_code> result; + + librados::ObjectWriteOperation op; + op.assert_exists(); + librados::async_operate(service, io, "noexist", &op, 0, nullptr, + capture(signal, result)); + + service.poll(); + EXPECT_FALSE(service.stopped()); + EXPECT_FALSE(result); + + signal.emit(boost::asio::cancellation_type::terminal); + + service.run(); + ASSERT_TRUE(result); + ec = *result; + + signal.emit(boost::asio::cancellation_type::all); // noop + } while (ec == std::errc::no_such_file_or_directory && --tries); + + EXPECT_EQ(ec, boost::asio::error::operation_aborted); +} + +TEST_F(AsioRados, AsyncWriteOperationCancelTotal) +{ + boost::asio::io_context service; + boost::asio::cancellation_signal signal; + std::optional<error_code> ec; + + librados::ObjectWriteOperation op; + op.assert_exists(); + librados::async_operate(service, io, "noexist", &op, 0, nullptr, + capture(signal, ec)); + + service.poll(); + EXPECT_FALSE(service.stopped()); + EXPECT_FALSE(ec); + + // noop, write only supports terminal + signal.emit(boost::asio::cancellation_type::total); + + service.run(); + ASSERT_TRUE(ec); + EXPECT_EQ(ec, std::errc::no_such_file_or_directory); + + signal.emit(boost::asio::cancellation_type::all); // noop +} + +#endif // not _WIN32 + int main(int argc, char **argv) { auto args = argv_to_vec(argc, argv); diff --git a/src/test/librbd/migration/test_mock_HttpClient.cc b/src/test/librbd/migration/test_mock_HttpClient.cc index f3888755c79..901c4231dd0 100644 --- a/src/test/librbd/migration/test_mock_HttpClient.cc +++ b/src/test/librbd/migration/test_mock_HttpClient.cc @@ -307,7 +307,7 @@ TEST_F(TestMockMigrationHttpClient, OpenCloseHttps) { boost::asio::ssl::context ssl_context{boost::asio::ssl::context::tlsv12}; load_server_certificate(ssl_context); - boost::beast::ssl_stream<boost::beast::tcp_stream> ssl_stream{ + boost::asio::ssl::stream<boost::asio::ip::tcp::socket> ssl_stream{ std::move(socket), ssl_context}; C_SaferCond on_ssl_handshake_ctx; @@ -341,7 +341,7 @@ 
TEST_F(TestMockMigrationHttpClient, OpenHttpsHandshakeFail) { boost::asio::ssl::context ssl_context{boost::asio::ssl::context::tlsv12}; load_server_certificate(ssl_context); - boost::beast::ssl_stream<boost::beast::tcp_stream> ssl_stream{ + boost::asio::ssl::stream<boost::asio::ip::tcp::socket> ssl_stream{ std::move(socket), ssl_context}; C_SaferCond on_ssl_handshake_ctx; diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc index 008fdcfa7be..8f6cbb9e807 100644 --- a/src/test/librbd/test_internal.cc +++ b/src/test/librbd/test_internal.cc @@ -1571,6 +1571,83 @@ TEST_F(TestInternal, FlattenNoEmptyObjects) rados_ioctx_destroy(d_ioctx); } +TEST_F(TestInternal, FlattenInconsistentObjectMap) +{ + REQUIRE_FEATURE(RBD_FEATURE_LAYERING | RBD_FEATURE_OBJECT_MAP); + REQUIRE(!is_feature_enabled(RBD_FEATURE_STRIPINGV2)); + + librbd::ImageCtx* ictx; + ASSERT_EQ(0, open_image(m_image_name, &ictx)); + + librbd::NoOpProgressContext no_op; + ASSERT_EQ(0, ictx->operations->resize((1 << ictx->order) * 5, true, no_op)); + + bufferlist bl; + bl.append(std::string(256, '1')); + for (int i = 1; i < 5; i++) { + ASSERT_EQ(256, api::Io<>::write(*ictx, (1 << ictx->order) * i, 256, + bufferlist{bl}, 0)); + } + + ASSERT_EQ(0, snap_create(*ictx, "snap")); + ASSERT_EQ(0, snap_protect(*ictx, "snap")); + + uint64_t features; + ASSERT_EQ(0, librbd::get_features(ictx, &features)); + + std::string clone_name = get_temp_image_name(); + int order = ictx->order; + ASSERT_EQ(0, librbd::clone(m_ioctx, m_image_name.c_str(), "snap", m_ioctx, + clone_name.c_str(), features, &order, 0, 0)); + + close_image(ictx); + ASSERT_EQ(0, open_image(clone_name, &ictx)); + + C_SaferCond lock_ctx; + { + std::shared_lock owner_locker{ictx->owner_lock}; + ictx->exclusive_lock->try_acquire_lock(&lock_ctx); + } + ASSERT_EQ(0, lock_ctx.wait()); + ASSERT_TRUE(ictx->exclusive_lock->is_lock_owner()); + + ceph::BitVector<2> inconsistent_object_map; + inconsistent_object_map.resize(5); + 
inconsistent_object_map[0] = OBJECT_NONEXISTENT; + inconsistent_object_map[1] = OBJECT_NONEXISTENT; + inconsistent_object_map[2] = OBJECT_EXISTS; + inconsistent_object_map[3] = OBJECT_EXISTS_CLEAN; + // OBJECT_PENDING shouldn't happen within parent overlap, but test + // anyway + inconsistent_object_map[4] = OBJECT_PENDING; + + auto object_map = new librbd::ObjectMap<>(*ictx, CEPH_NOSNAP); + C_SaferCond save_ctx; + { + std::shared_lock owner_locker{ictx->owner_lock}; + std::unique_lock image_locker{ictx->image_lock}; + object_map->set_object_map(inconsistent_object_map); + object_map->aio_save(&save_ctx); + } + ASSERT_EQ(0, save_ctx.wait()); + object_map->put(); + + close_image(ictx); + ASSERT_EQ(0, open_image(clone_name, &ictx)); + ASSERT_EQ(0, ictx->operations->flatten(no_op)); + + bufferptr read_ptr(256); + bufferlist read_bl; + read_bl.push_back(read_ptr); + + librbd::io::ReadResult read_result{&read_bl}; + for (int i = 1; i < 5; i++) { + ASSERT_EQ(256, api::Io<>::read(*ictx, (1 << ictx->order) * i, 256, + librbd::io::ReadResult{read_result}, 0)); + EXPECT_TRUE(bl.contents_equal(read_bl)); + } +} + TEST_F(TestInternal, PoolMetadataConfApply) { REQUIRE_FORMAT_V2(); diff --git a/src/test/objectstore/ObjectStoreImitator.h b/src/test/objectstore/ObjectStoreImitator.h index d71d7f2fe58..875f9041b83 100644 --- a/src/test/objectstore/ObjectStoreImitator.h +++ b/src/test/objectstore/ObjectStoreImitator.h @@ -347,6 +347,16 @@ public: ) override { return {}; } + + int omap_iterate(CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + /// [in] where the iterator should point to at the beginning + omap_iter_seek_t start_from, + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override { + return 0; + } + void set_fsid(uuid_d u) override {} uuid_d get_fsid() override { return {}; } uint64_t estimate_objects_overhead(uint64_t num_objects) override { diff --git a/src/test/objectstore/allocsim/ops_replayer.cc 
b/src/test/objectstore/allocsim/ops_replayer.cc index fd947f5c454..c5908d9f576 100644 --- a/src/test/objectstore/allocsim/ops_replayer.cc +++ b/src/test/objectstore/allocsim/ops_replayer.cc @@ -1,4 +1,5 @@ #include <algorithm> +#include <functional> #include <boost/program_options/value_semantic.hpp> #include <cassert> #include <cctype> @@ -13,26 +14,46 @@ #include <fstream> #include <filesystem> #include <mutex> -#include "include/rados/buffer_fwd.h" -#include "include/rados/librados.hpp" #include <atomic> -#include <fmt/format.h> #include <map> #include <memory> #include <random> #include <string> #include <iostream> #include <vector> +#include <format> + +#include <fmt/format.h> #include <boost/program_options/variables_map.hpp> #include <boost/program_options/parsers.hpp> +#include "include/rados/buffer_fwd.h" +#include "include/rados/librados.hpp" + namespace po = boost::program_options; using namespace std; using namespace ceph; +namespace settings { + +// Returns a function which restricts a value to a specified range by throwing if it is not in range: +// (Note: std::clamp() does not throw.) +auto clamp_or_throw(auto min, auto max) +{ + return [=](auto& x) { + if(std::less<>{}(x, min) or std::greater<>{}(x, max)) { + throw std::out_of_range(fmt::format("value expected between {} and {}, but got {}", min, max, x)); + } + + return x; + }; +} + +} // namespace settings + // compare shared_ptr<string> struct StringPtrCompare { @@ -338,8 +359,8 @@ int main(int argc, char** argv) { // options uint64_t io_depth = 8; - uint64_t nparser_threads = 16; - uint64_t nworker_threads = 16; + int nparser_threads = 16; + int nworker_threads = 16; string file("input.txt"); string ceph_conf_path("./ceph.conf"); string pool("test_pool"); @@ -351,8 +372,8 @@ int main(int argc, char** argv) { ("input-files,i", po::value<vector<string>>()->multitoken(), "List of input files (output of op_scraper.py). 
Multiple files will be merged and sorted by time order") ("ceph-conf", po::value<string>(&ceph_conf_path)->default_value("ceph.conf"), "Path to ceph conf") ("io-depth", po::value<uint64_t>(&io_depth)->default_value(64), "I/O depth") - ("parser-threads", po::value<uint64_t>(&nparser_threads)->default_value(16), "Number of parser threads") - ("worker-threads", po::value<uint64_t>(&nworker_threads)->default_value(16), "Number of I/O worker threads") + ("parser-threads", po::value<int>(&nparser_threads)->default_value(16)->notifier(settings::clamp_or_throw(1, 256)), "Number of parser threads") + ("worker-threads", po::value<int>(&nworker_threads)->default_value(16)->notifier(settings::clamp_or_throw(1, 256)), "Number of I/O worker threads") ("pool", po::value<string>(&pool)->default_value("test_pool"), "Pool to use for I/O") ("skip-do-ops", po::bool_switch(&skip_do_ops)->default_value(false), "Skip doing operations") ; diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index d3b0d0ac3a4..32173d61afe 100644 --- a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -1426,6 +1426,87 @@ TEST(BlueFS, test_concurrent_dir_link_and_compact_log_56210) { } } +TEST(BlueFS, truncate_drops_allocations) { + constexpr uint64_t K = 1024; + constexpr uint64_t M = 1024 * K; + uuid_d fsid; + const char* DIR_NAME="dir"; + const char* FILE_NAME="file1"; + struct { + uint64_t preallocated_size; + uint64_t write_size; + uint64_t truncate_to; + uint64_t allocated_after_truncate; + uint64_t slow_size = 0; + uint64_t slow_alloc_size = 64*K; + uint64_t db_size = 128*M; + uint64_t db_alloc_size = 1*M; + } scenarios [] = { + // on DB(which is SLOW) : 1 => 1, 64K remains + { 1*M, 1, 1, 64*K }, + // on DB(which is SLOW), alloc 4K : 1 => 1, 4K remains + { 1*M, 1, 1, 4*K, 0, 4*K }, + // on DB(which is SLOW), truncation on AU boundary : 128K => 128K, 128K remains + { 1*M, 128*K, 128*K, 128*K }, + // on DB(which is SLOW), no prealloc, 
truncation to 0 : 1666K => 0, 0 remains + { 0, 1666*K, 0, 0 }, + // on DB, truncate to 123K, expect 1M occupied + { 1234*K, 123*K, 123*K, 1*M, 128*M, 64*K, 10*M, 1*M }, + // on DB, truncate to 0, expect 0 occupied + { 1234*K, 345*K, 0, 0, 128*M, 64*K, 10*M, 1*M }, + // on DB, truncate to AU boundary, expect exactly 1M occupied + { 1234*K, 1123*K, 1*M, 1*M, 128*M, 64*K, 10*M, 1*M }, + // on DB and SLOW, truncate only data on SLOW + { 0, 10*M+1, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M }, + // on DB and SLOW, preallocate and truncate only data on SLOW + { 6*M, 12*M, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M }, + // on DB and SLOW, preallocate and truncate all in SLOW and some on DB + // note! prealloc 6M is important, one allocation for 12M will fallback to SLOW + // in 6M + 6M we can be sure that 6M is on DB and 6M is on SLOW + { 6*M, 12*M, 3*M+1, 4*M, 128*M, 64*K, 11*M, 1*M }, + }; + for (auto& s : scenarios) { + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_shared_alloc_size", stringify(s.slow_alloc_size).c_str()); + conf.SetVal("bluefs_alloc_size", stringify(s.db_alloc_size).c_str()); + + g_ceph_context->_conf.set_val("bluefs_shared_alloc_size", stringify(s.slow_alloc_size)); + g_ceph_context->_conf.set_val("bluefs_alloc_size", stringify(s.db_alloc_size)); + TempBdev bdev_db{s.db_size}; + TempBdev bdev_slow{s.slow_size}; + + BlueFS fs(g_ceph_context); + if (s.db_size != 0) { + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0)); + } + if (s.slow_size != 0) { + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0)); + } + + ASSERT_EQ(0, fs.mkfs(fsid, {BlueFS::BDEV_DB, false, false})); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({BlueFS::BDEV_DB, false, false})); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, false)); + uint64_t pre = fs.get_used(); + ASSERT_EQ(0, fs.preallocate(h->file, 0, s.preallocated_size)); 
+ const std::string content(s.write_size, 'x'); + h->append(content.c_str(), content.length()); + fs.fsync(h); + ASSERT_EQ(0, fs.truncate(h, s.truncate_to)); + fs.fsync(h); + uint64_t post = fs.get_used(); + fs.close_writer(h); + EXPECT_EQ(pre, post - s.allocated_after_truncate); + + fs.umount(); + } +} + + + + TEST(BlueFS, test_log_runway) { uint64_t max_log_runway = 65536; ConfSaver conf(g_ceph_context->_conf); @@ -1608,6 +1689,91 @@ TEST(BlueFS, test_log_runway_advance_seq) { fs.compact_log(); } +TEST(BlueFS, test_69481_truncate_corrupts_log) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + + BlueFS::FileWriter *f = nullptr; + BlueFS::FileWriter *a = nullptr; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write("dir", "test-file", &f, false)); + ASSERT_EQ(0, fs.open_for_write("dir", "just-allocate", &a, false)); + + // create 4 distinct extents in file f + // a is here only to prevent f from merging extents together + fs.preallocate(f->file, 0, 0x10000); + fs.preallocate(a->file, 0, 0x10000); + fs.preallocate(f->file, 0, 0x20000); + fs.preallocate(a->file, 0, 0x20000); + fs.preallocate(f->file, 0, 0x30000); + fs.preallocate(a->file, 0, 0x30000); + fs.preallocate(f->file, 0, 0x40000); + fs.preallocate(a->file, 0, 0x40000); + fs.close_writer(a); + + fs.truncate(f, 0); + fs.fsync(f); + + bufferlist bl; + bl.append(std::string(" ", 0x15678)); + f->append(bl); + fs.truncate(f, 0x15678); + fs.fsync(f); + fs.close_writer(f); + + fs.umount(); + // remount to verify + ASSERT_EQ(0, fs.mount()); + fs.umount(); +} + +TEST(BlueFS, test_69481_truncate_asserts) { + uint64_t size = 1048576 * 128; + TempBdev bdev{size}; + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, 
fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); + + BlueFS::FileWriter *f = nullptr; + BlueFS::FileWriter *a = nullptr; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write("dir", "test-file", &f, false)); + ASSERT_EQ(0, fs.open_for_write("dir", "just-allocate", &a, false)); + + // create 4 distinct extents in file f + // a is here only to prevent f from merging extents together + fs.preallocate(f->file, 0, 0x10000); + fs.preallocate(a->file, 0, 0x10000); + fs.preallocate(f->file, 0, 0x20000); + fs.preallocate(a->file, 0, 0x20000); + fs.preallocate(f->file, 0, 0x30000); + fs.preallocate(a->file, 0, 0x30000); + fs.preallocate(f->file, 0, 0x40000); + fs.preallocate(a->file, 0, 0x40000); + fs.close_writer(a); + + fs.truncate(f, 0); + fs.fsync(f); + + bufferlist bl; + bl.append(std::string(" ", 0x35678)); + f->append(bl); + fs.truncate(f, 0x35678); + fs.fsync(f); + fs.close_writer(f); + + fs.umount(); +} + int main(int argc, char **argv) { auto args = argv_to_vec(argc, argv); map<string,string> defaults = { diff --git a/src/test/osd/CMakeLists.txt b/src/test/osd/CMakeLists.txt index f2d1471e22e..798558ebbe0 100644 --- a/src/test/osd/CMakeLists.txt +++ b/src/test/osd/CMakeLists.txt @@ -22,7 +22,7 @@ install(TARGETS add_executable(ceph_test_rados_io_sequence ${CMAKE_CURRENT_SOURCE_DIR}/ceph_test_rados_io_sequence.cc) target_link_libraries(ceph_test_rados_io_sequence - librados global object_io_exerciser) + librados global object_io_exerciser json_structures) install(TARGETS ceph_test_rados_io_sequence DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/src/test/osd/ceph_test_rados_io_sequence.cc b/src/test/osd/ceph_test_rados_io_sequence.cc index 4a768a016e2..96808ea37e5 100644 --- a/src/test/osd/ceph_test_rados_io_sequence.cc +++ 
b/src/test/osd/ceph_test_rados_io_sequence.cc @@ -1,83 +1,104 @@ #include "ceph_test_rados_io_sequence.h" +#include <boost/asio/io_context.hpp> #include <iostream> #include <vector> -#include <boost/asio/io_context.hpp> - -#include "include/random.h" - -#include "librados/librados_asio.h" -#include "common/ceph_argparse.h" -#include "include/interval_set.h" -#include "global/global_init.h" -#include "global/global_context.h" +#include "common/Formatter.h" #include "common/Thread.h" +#include "common/ceph_argparse.h" +#include "common/ceph_json.h" #include "common/debug.h" #include "common/dout.h" #include "common/split.h" #include "common/strtol.h" // for strict_iecstrtoll() +#include "common/ceph_json.h" +#include "common/Formatter.h" #include "common/io_exerciser/DataGenerator.h" +#include "common/io_exerciser/EcIoSequence.h" +#include "common/io_exerciser/IoOp.h" +#include "common/io_exerciser/IoSequence.h" #include "common/io_exerciser/Model.h" #include "common/io_exerciser/ObjectModel.h" #include "common/io_exerciser/RadosIo.h" -#include "common/io_exerciser/IoOp.h" -#include "common/io_exerciser/IoSequence.h" +#include "common/json/BalancerStructures.h" +#include "common/json/ConfigStructures.h" +#include "common/json/OSDStructures.h" +#include "fmt/format.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "include/interval_set.h" +#include "include/random.h" +#include "json_spirit/json_spirit.h" +#include "librados/librados_asio.h" #define dout_subsys ceph_subsys_rados #define dout_context g_ceph_context +using OpType = ceph::io_exerciser::OpType; + +using DoneOp = ceph::io_exerciser::DoneOp; +using BarrierOp = ceph::io_exerciser::BarrierOp; +using CreateOp = ceph::io_exerciser::CreateOp; +using RemoveOp = ceph::io_exerciser::RemoveOp; +using SingleReadOp = ceph::io_exerciser::SingleReadOp; +using DoubleReadOp = ceph::io_exerciser::DoubleReadOp; +using TripleReadOp = ceph::io_exerciser::TripleReadOp; +using SingleWriteOp = 
ceph::io_exerciser::SingleWriteOp; +using DoubleWriteOp = ceph::io_exerciser::DoubleWriteOp; +using TripleWriteOp = ceph::io_exerciser::TripleWriteOp; +using SingleFailedWriteOp = ceph::io_exerciser::SingleFailedWriteOp; +using DoubleFailedWriteOp = ceph::io_exerciser::DoubleFailedWriteOp; +using TripleFailedWriteOp = ceph::io_exerciser::TripleFailedWriteOp; + namespace { - struct Size {}; - void validate(boost::any& v, const std::vector<std::string>& values, - Size *target_type, int) { - po::validators::check_first_occurrence(v); - const std::string &s = po::validators::get_single_string(values); - - std::string parse_error; - uint64_t size = strict_iecstrtoll(s, &parse_error); - if (!parse_error.empty()) { - throw po::validation_error(po::validation_error::invalid_option_value); - } - v = boost::any(size); - } - - struct Pair {}; - void validate(boost::any& v, const std::vector<std::string>& values, - Pair *target_type, int) { - po::validators::check_first_occurrence(v); - const std::string &s = po::validators::get_single_string(values); - auto part = ceph::split(s).begin(); - std::string parse_error; - int first = strict_iecstrtoll(*part++, &parse_error); - int second = strict_iecstrtoll(*part, &parse_error); - if (!parse_error.empty()) { - throw po::validation_error(po::validation_error::invalid_option_value); - } - v = boost::any(std::pair<int,int>{first,second}); - } - - struct PluginString {}; - void validate(boost::any& v, const std::vector<std::string>& values, - PluginString *target_type, int) { - po::validators::check_first_occurrence(v); - const std::string &s = po::validators::get_single_string(values); - - const std::string_view* pluginIt = std::find( - ceph::io_sequence::tester::pluginChoices.begin(), - ceph::io_sequence::tester::pluginChoices.end(), - s - ); - if(ceph::io_sequence::tester::pluginChoices.end() == pluginIt) - { - throw po::validation_error(po::validation_error::invalid_option_value); - } +struct Size {}; +void validate(boost::any& v, 
const std::vector<std::string>& values, + Size* target_type, int) { + po::validators::check_first_occurrence(v); + const std::string& s = po::validators::get_single_string(values); - v = boost::any(*pluginIt); + std::string parse_error; + uint64_t size = strict_iecstrtoll(s, &parse_error); + if (!parse_error.empty()) { + throw po::validation_error(po::validation_error::invalid_option_value); } + v = boost::any(size); +} + +struct Pair {}; +void validate(boost::any& v, const std::vector<std::string>& values, + Pair* target_type, int) { + po::validators::check_first_occurrence(v); + const std::string& s = po::validators::get_single_string(values); + auto part = ceph::split(s).begin(); + std::string parse_error; + int first = strict_iecstrtoll(*part++, &parse_error); + int second = strict_iecstrtoll(*part, &parse_error); + if (!parse_error.empty()) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + v = boost::any(std::pair<int, int>{first, second}); +} + +struct PluginString {}; +void validate(boost::any& v, const std::vector<std::string>& values, + PluginString* target_type, int) { + po::validators::check_first_occurrence(v); + const std::string& s = po::validators::get_single_string(values); + + const std::string_view* pluginIt = + std::find(ceph::io_sequence::tester::pluginChoices.begin(), + ceph::io_sequence::tester::pluginChoices.end(), s); + if (ceph::io_sequence::tester::pluginChoices.end() == pluginIt) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + + v = boost::any(*pluginIt); +} - constexpr std::string_view usage[] = { +constexpr std::string_view usage[] = { "Basic usage:", "", "ceph_test_rados_io_sequence", @@ -119,103 +140,99 @@ namespace { "\t are specified with unit of blocksize. 
Supported commands:", "\t\t create <len>", "\t\t remove", - "\t\t read|write <off> <len>", - "\t\t read2|write2 <off> <len> <off> <len>", - "\t\t read3|write3 <off> <len> <off> <len> <off> <len>", - "\t\t done" - }; - - po::options_description get_options_description() - { - po::options_description desc("ceph_test_rados_io options"); - desc.add_options() - ("help,h", - "show help message") - ("listsequence,l", - "show list of sequences") - ("dryrun,d", - "test sequence, do not issue any I/O") - ("verbose", - "more verbose output during test") - ("sequence,s", po::value<int>(), - "test specified sequence") - ("seed", po::value<int>(), - "seed for whole test") - ("seqseed", po::value<int>(), - "seed for sequence") - ("blocksize,b", po::value<Size>(), - "block size (default 2048)") - ("chunksize,c", po::value<Size>(), - "chunk size (default 4096)") - ("pool,p", po::value<std::string>(), - "pool name") - ("object,o", po::value<std::string>()->default_value("test"), - "object name") - ("km", po::value<Pair>(), - "k,m EC pool profile (default 2,2)") - ("plugin", po::value<PluginString>(), - "EC plugin (isa or jerasure)") - ("objectsize", po::value<Pair>(), - "min,max object size in blocks (default 1,32)") - ("threads,t", po::value<int>(), - "number of threads of I/O per object (default 1)") - ("parallel,p", po::value<int>()->default_value(1), - "number of objects to exercise in parallel") - ("interactive", - "interactive mode, execute IO commands from stdin"); - - return desc; - } - - int parse_io_seq_options( - po::variables_map& vm, - int argc, - char** argv) - { - std::vector<std::string> unrecognized_options; - try { - po::options_description desc = get_options_description(); - - auto parsed = po::command_line_parser(argc, argv) - .options(desc) - .allow_unregistered() - .run(); - po::store(parsed, vm); - po::notify(vm); - unrecognized_options = po::collect_unrecognized(parsed.options, - po::include_positional); - - if (!unrecognized_options.empty()) - { - 
std::stringstream ss; - ss << "Unrecognised command options supplied: "; - while (unrecognized_options.size() > 1) - { - ss << unrecognized_options.back().c_str() << ", "; - unrecognized_options.pop_back(); - } - ss << unrecognized_options.back(); - dout(0) << ss.str() << dendl; - return 1; + "\t\t read|write|failedwrite <off> <len>", + "\t\t read2|write2|failedwrite2 <off> <len> <off> <len>", + "\t\t read3|write3|failedwrite3 <off> <len> <off> <len> <off> <len>", + "\t\t injecterror <type> <shard> <good_count> <fail_count>", + "\t\t clearinject <type> <shard>", + "\t\t done"}; + +po::options_description get_options_description() { + po::options_description desc("ceph_test_rados_io options"); + desc.add_options()("help,h", "show help message")("listsequence,l", + "show list of sequences")( + "dryrun,d", "test sequence, do not issue any I/O")( + "verbose", "more verbose output during test")( + "sequence,s", po::value<int>(), "test specified sequence")( + "seed", po::value<int>(), "seed for whole test")( + "seqseed", po::value<int>(), "seed for sequence")( + "blocksize,b", po::value<Size>(), "block size (default 2048)")( + "chunksize,c", po::value<Size>(), "chunk size (default 4096)")( + "pool,p", po::value<std::string>(), "pool name")( + "object,o", po::value<std::string>()->default_value("test"), + "object name")("km", po::value<Pair>(), + "k,m EC pool profile (default 2,2)")( + "plugin", po::value<PluginString>(), "EC plugin (isa or jerasure)")( + "objectsize", po::value<Pair>(), + "min,max object size in blocks (default 1,32)")( + "threads,t", po::value<int>(), + "number of threads of I/O per object (default 1)")( + "parallel,p", po::value<int>()->default_value(1), + "number of objects to exercise in parallel")( + "testrecovery", + "Inject errors during sequences to test recovery processes of OSDs")( + "interactive", "interactive mode, execute IO commands from stdin")( + "allow_pool_autoscaling", + "Allows pool autoscaling. 
Disabled by default.")( + "allow_pool_balancer", "Enables pool balancing. Disabled by default.")( + "allow_pool_deep_scrubbing", + "Enables pool deep scrub. Disabled by default.")( + "allow_pool_scrubbing", "Enables pool scrubbing. Disabled by default."); + + return desc; +} + +int parse_io_seq_options(po::variables_map& vm, int argc, char** argv) { + std::vector<std::string> unrecognized_options; + try { + po::options_description desc = get_options_description(); + + auto parsed = po::command_line_parser(argc, argv) + .options(desc) + .allow_unregistered() + .run(); + po::store(parsed, vm); + po::notify(vm); + unrecognized_options = + po::collect_unrecognized(parsed.options, po::include_positional); + + if (!unrecognized_options.empty()) { + std::stringstream ss; + ss << "Unrecognised command options supplied: "; + while (unrecognized_options.size() > 1) { + ss << unrecognized_options.back().c_str() << ", "; + unrecognized_options.pop_back(); } - } catch(const po::error& e) { - std::cerr << "error: " << e.what() << std::endl; + ss << unrecognized_options.back(); + dout(0) << ss.str() << dendl; return 1; } - - return 0; + } catch (const po::error& e) { + std::cerr << "error: " << e.what() << std::endl; + return 1; } + + return 0; } +template <typename S> +int send_mon_command(S& s, librados::Rados& rados, const char* name, + ceph::buffer::list& inbl, ceph::buffer::list* outbl, Formatter* f) { + std::ostringstream oss; + encode_json(name, s, f); + f->flush(oss); + int rc = rados.mon_command(oss.str(), inbl, outbl, nullptr); + return rc; +} + +} // namespace + template <typename T, int N, const std::array<T, N>& Ts> -ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts> - ::ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, - po::variables_map vm, - const std::string& option_name, - bool set_forced, - bool select_first) - : rng(rng), - option_name(option_name) { +ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>:: + 
ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, + po::variables_map vm, const std::string& option_name, + bool set_forced, bool select_first) + : rng(rng), option_name(option_name) { if (set_forced && vm.count(option_name)) { force_value = vm[option_name].as<T>(); } @@ -226,76 +243,54 @@ ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts> } template <typename T, int N, const std::array<T, N>& Ts> -bool ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::isForced() -{ +bool ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::isForced() { return force_value.has_value(); } template <typename T, int N, const std::array<T, N>& Ts> -const T ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::choose() -{ +const T ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::choose() { if (force_value.has_value()) { return *force_value; } else if (first_value.has_value()) { return *std::exchange(first_value, std::nullopt); } else { - return choices[rng(N-1)]; + return choices[rng(N - 1)]; } } - - ceph::io_sequence::tester::SelectObjectSize::SelectObjectSize( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "objectsize", true, true) -{ -} - - + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "objectsize", true, true) {} ceph::io_sequence::tester::SelectBlockSize::SelectBlockSize( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "blocksize", true, true) -{ -} - - + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "blocksize", true, true) {} ceph::io_sequence::tester::SelectNumThreads::SelectNumThreads( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "threads", true, true) -{ -} - - + ceph::util::random_number_generator<int>& rng, 
po::variables_map vm) + : ProgramOptionSelector(rng, vm, "threads", true, true) {} ceph::io_sequence::tester::SelectSeqRange::SelectSeqRange( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "sequence", false, false) -{ + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "sequence", false, false) { if (vm.count(option_name)) { ceph::io_exerciser::Sequence s = - static_cast<ceph::io_exerciser::Sequence>(vm["sequence"].as<int>()); + static_cast<ceph::io_exerciser::Sequence>(vm["sequence"].as<int>()); if (s < ceph::io_exerciser::Sequence::SEQUENCE_BEGIN || s >= ceph::io_exerciser::Sequence::SEQUENCE_END) { dout(0) << "Sequence argument out of range" << dendl; throw po::validation_error(po::validation_error::invalid_option_value); } ceph::io_exerciser::Sequence e = s; - force_value = std::make_optional<std::pair<ceph::io_exerciser::Sequence, - ceph::io_exerciser::Sequence>>( - std::make_pair(s, ++e)); + force_value = std::make_optional< + std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>>( + std::make_pair(s, ++e)); } } -const std::pair<ceph::io_exerciser::Sequence,ceph::io_exerciser::Sequence> - ceph::io_sequence::tester::SelectSeqRange::choose() { - if (force_value.has_value()) - { +const std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence> +ceph::io_sequence::tester::SelectSeqRange::choose() { + if (force_value.has_value()) { return *force_value; } else { return std::make_pair(ceph::io_exerciser::Sequence::SEQUENCE_BEGIN, @@ -303,45 +298,34 @@ const std::pair<ceph::io_exerciser::Sequence,ceph::io_exerciser::Sequence> } } - - ceph::io_sequence::tester::SelectErasureKM::SelectErasureKM( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "km", true, true) -{ -} - - + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, 
vm, "km", true, true) {} ceph::io_sequence::tester::SelectErasurePlugin::SelectErasurePlugin( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "plugin", true, false) -{ -} - - - -ceph::io_sequence::tester::SelectErasureChunkSize::SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, po::variables_map vm) - : ProgramOptionSelector(rng, vm, "stripe_unit", true, false) -{ -} - + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "plugin", true, false) {} +ceph::io_sequence::tester::SelectErasureChunkSize::SelectErasureChunkSize( + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "chunksize", true, true) {} ceph::io_sequence::tester::SelectECPool::SelectECPool( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm, - librados::Rados& rados, - bool dry_run) - : ProgramOptionSelector(rng, vm, "pool", false, false), - rados(rados), - dry_run(dry_run), - skm(SelectErasureKM(rng, vm)), - spl(SelectErasurePlugin(rng, vm)), - scs(SelectErasureChunkSize(rng, vm)) -{ + ceph::util::random_number_generator<int>& rng, po::variables_map vm, + librados::Rados& rados, bool dry_run, bool allow_pool_autoscaling, + bool allow_pool_balancer, bool allow_pool_deep_scrubbing, + bool allow_pool_scrubbing, bool test_recovery) + : ProgramOptionSelector(rng, vm, "pool", false, false), + rados(rados), + dry_run(dry_run), + allow_pool_autoscaling(allow_pool_autoscaling), + allow_pool_balancer(allow_pool_balancer), + allow_pool_deep_scrubbing(allow_pool_deep_scrubbing), + allow_pool_scrubbing(allow_pool_scrubbing), + test_recovery(test_recovery), + skm(SelectErasureKM(rng, vm)), + spl(SelectErasurePlugin(rng, vm)), + scs(SelectErasureChunkSize(rng, vm)) { if (!skm.isForced()) { if (vm.count("pool")) { force_value = vm["pool"].as<std::string>(); @@ -349,147 +333,239 @@ 
ceph::io_sequence::tester::SelectECPool::SelectECPool( } } -const std::string ceph::io_sequence::tester::SelectECPool::choose() -{ - std::pair<int,int> value; +const std::string ceph::io_sequence::tester::SelectECPool::choose() { + std::pair<int, int> value; if (!skm.isForced() && force_value.has_value()) { + int rc; + bufferlist inbl, outbl; + auto formatter = std::make_unique<JSONFormatter>(false); + + ceph::messaging::osd::OSDPoolGetRequest osdPoolGetRequest{*force_value}; + rc = send_mon_command(osdPoolGetRequest, rados, "OSDPoolGetRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + ceph::messaging::osd::OSDPoolGetReply osdPoolGetReply; + osdPoolGetReply.decode_json(&p); + + ceph::messaging::osd::OSDECProfileGetRequest osdECProfileGetRequest{ + osdPoolGetReply.erasure_code_profile}; + rc = send_mon_command(osdECProfileGetRequest, rados, + "OSDECProfileGetRequest", inbl, &outbl, + formatter.get()); + ceph_assert(rc == 0); + + success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + ceph::messaging::osd::OSDECProfileGetReply reply; + reply.decode_json(&p); + k = reply.k; + m = reply.m; return *force_value; } else { value = skm.choose(); } - int k = value.first; - int m = value.second; + k = value.first; + m = value.second; const std::string plugin = std::string(spl.choose()); const uint64_t chunk_size = scs.choose(); - std::string pool_name = "ec_" + plugin + - "_cs" + std::to_string(chunk_size) + - "_k" + std::to_string(k) + - "_m" + std::to_string(m); - if (!dry_run) - { + std::string pool_name = "ec_" + plugin + "_cs" + std::to_string(chunk_size) + + "_k" + std::to_string(k) + "_m" + std::to_string(m); + if (!dry_run) { create_pool(rados, pool_name, plugin, chunk_size, k, m); } return pool_name; } void ceph::io_sequence::tester::SelectECPool::create_pool( - librados::Rados& rados, - const std::string& pool_name, - const 
std::string& plugin, - uint64_t chunk_size, - int k, int m) -{ + librados::Rados& rados, const std::string& pool_name, + const std::string& plugin, uint64_t chunk_size, int k, int m) { int rc; bufferlist inbl, outbl; - std::string profile_create = - "{\"prefix\": \"osd erasure-code-profile set\", \ - \"name\": \"testprofile-" + pool_name + "\", \ - \"profile\": [ \"plugin=" + plugin + "\", \ - \"k=" + std::to_string(k) + "\", \ - \"m=" + std::to_string(m) + "\", \ - \"stripe_unit=" + std::to_string(chunk_size) + "\", \ - \"crush-failure-domain=osd\"]}"; - rc = rados.mon_command(profile_create, inbl, &outbl, nullptr); + auto formatter = std::make_unique<JSONFormatter>(false); + + ceph::messaging::osd::OSDECProfileSetRequest ecProfileSetRequest{ + fmt::format("testprofile-{}", pool_name), + {fmt::format("plugin={}", plugin), fmt::format("k={}", k), + fmt::format("m={}", m), fmt::format("stripe_unit={}", chunk_size), + fmt::format("crush-failure-domain=osd")}}; + rc = send_mon_command(ecProfileSetRequest, rados, "OSDECProfileSetRequest", + inbl, &outbl, formatter.get()); ceph_assert(rc == 0); - std::string cmdstr = - "{\"prefix\": \"osd pool create\", \ - \"pool\": \"" + pool_name + "\", \ - \"pool_type\": \"erasure\", \ - \"pg_num\": 8, \ - \"pgp_num\": 8, \ - \"erasure_code_profile\": \"testprofile-" + pool_name + "\"}"; - rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr); + + ceph::messaging::osd::OSDECPoolCreateRequest poolCreateRequest{ + pool_name, "erasure", 8, 8, fmt::format("testprofile-{}", pool_name)}; + rc = send_mon_command(poolCreateRequest, rados, "OSDECPoolCreateRequest", + inbl, &outbl, formatter.get()); ceph_assert(rc == 0); -} + if (allow_pool_autoscaling) { + ceph::messaging::osd::OSDSetRequest setNoAutoscaleRequest{"noautoscale", + std::nullopt}; + rc = send_mon_command(setNoAutoscaleRequest, rados, "OSDSetRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + } + + if (allow_pool_balancer) { + 
ceph::messaging::balancer::BalancerOffRequest balancerOffRequest{}; + rc = send_mon_command(balancerOffRequest, rados, "BalancerOffRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + + ceph::messaging::balancer::BalancerStatusRequest balancerStatusRequest{}; + rc = send_mon_command(balancerStatusRequest, rados, "BalancerStatusRequest", + inbl, &outbl, formatter.get()); + ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + ceph::messaging::balancer::BalancerStatusReply reply; + reply.decode_json(&p); + ceph_assert(!reply.active); + } + if (allow_pool_deep_scrubbing) { + ceph::messaging::osd::OSDSetRequest setNoDeepScrubRequest{"nodeep-scrub", + std::nullopt}; + rc = send_mon_command(setNoDeepScrubRequest, rados, "setNoDeepScrubRequest", + inbl, &outbl, formatter.get()); + ceph_assert(rc == 0); + } + + if (allow_pool_scrubbing) { + ceph::messaging::osd::OSDSetRequest setNoScrubRequest{"noscrub", + std::nullopt}; + rc = send_mon_command(setNoScrubRequest, rados, "OSDSetRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + } + + if (test_recovery) { + ceph::messaging::config::ConfigSetRequest configSetBluestoreDebugRequest{ + "global", "bluestore_debug_inject_read_err", "true", std::nullopt}; + rc = send_mon_command(configSetBluestoreDebugRequest, rados, + "ConfigSetRequest", inbl, &outbl, + formatter.get()); + ceph_assert(rc == 0); + + ceph::messaging::config::ConfigSetRequest configSetMaxMarkdownRequest{ + "global", "osd_max_markdown_count", "99999999", std::nullopt}; + rc = + send_mon_command(configSetMaxMarkdownRequest, rados, "ConfigSetRequest", + inbl, &outbl, formatter.get()); + ceph_assert(rc == 0); + } +} -ceph::io_sequence::tester::TestObject::TestObject( const std::string oid, - librados::Rados& rados, - boost::asio::io_context& asio, - SelectBlockSize& sbs, - SelectECPool& spo, - SelectObjectSize& sos, - SelectNumThreads& snt, - SelectSeqRange& ssr, - 
ceph::util::random_number_generator<int>& rng, - ceph::mutex& lock, - ceph::condition_variable& cond, - bool dryrun, - bool verbose, - std::optional<int> seqseed) : - rng(rng), verbose(verbose), seqseed(seqseed) -{ +ceph::io_sequence::tester::TestObject::TestObject( + const std::string oid, librados::Rados& rados, + boost::asio::io_context& asio, SelectBlockSize& sbs, SelectECPool& spo, + SelectObjectSize& sos, SelectNumThreads& snt, SelectSeqRange& ssr, + ceph::util::random_number_generator<int>& rng, ceph::mutex& lock, + ceph::condition_variable& cond, bool dryrun, bool verbose, + std::optional<int> seqseed, bool testrecovery) + : rng(rng), verbose(verbose), seqseed(seqseed), testrecovery(testrecovery) { if (dryrun) { - verbose = true; - exerciser_model = std::make_unique<ceph::io_exerciser::ObjectModel>(oid, - sbs.choose(), - rng()); + exerciser_model = std::make_unique<ceph::io_exerciser::ObjectModel>( + oid, sbs.choose(), rng()); } else { const std::string pool = spo.choose(); + poolK = spo.getChosenK(); + poolM = spo.getChosenM(); + int threads = snt.choose(); - exerciser_model = std::make_unique<ceph::io_exerciser::RadosIo>(rados, - asio, - pool, - oid, - sbs.choose(), - rng(), - threads, - lock, - cond); - dout(0) << "= " << oid << " pool=" << pool - << " threads=" << threads - << " blocksize=" << exerciser_model->get_block_size() - << " =" << dendl; + + bufferlist inbl, outbl; + auto formatter = std::make_unique<JSONFormatter>(false); + + std::optional<std::vector<int>> cached_shard_order = std::nullopt; + + if (!spo.get_allow_pool_autoscaling() && !spo.get_allow_pool_balancer() && + !spo.get_allow_pool_deep_scrubbing() && + !spo.get_allow_pool_scrubbing()) { + ceph::messaging::osd::OSDMapRequest osdMapRequest{pool, oid, ""}; + int rc = send_mon_command(osdMapRequest, rados, "OSDMapRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + 
ceph::messaging::osd::OSDMapReply reply{}; + reply.decode_json(&p); + cached_shard_order = reply.acting; + } + + exerciser_model = std::make_unique<ceph::io_exerciser::RadosIo>( + rados, asio, pool, oid, cached_shard_order, sbs.choose(), rng(), + threads, lock, cond); + dout(0) << "= " << oid << " pool=" << pool << " threads=" << threads + << " blocksize=" << exerciser_model->get_block_size() << " =" + << dendl; } obj_size_range = sos.choose(); seq_range = ssr.choose(); curseq = seq_range.first; - seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq, - obj_size_range, - seqseed.value_or(rng())); + + if (testrecovery) { + seq = ceph::io_exerciser::EcIoSequence::generate_sequence( + curseq, obj_size_range, poolK, poolM, seqseed.value_or(rng())); + } else { + seq = ceph::io_exerciser::IoSequence::generate_sequence( + curseq, obj_size_range, seqseed.value_or(rng())); + } + op = seq->next(); done = false; - dout(0) << "== " << exerciser_model->get_oid() << " " - << curseq << " " - << seq->get_name() - << " ==" <<dendl; + dout(0) << "== " << exerciser_model->get_oid() << " " << curseq << " " + << seq->get_name_with_seqseed() << " ==" << dendl; } -bool ceph::io_sequence::tester::TestObject::readyForIo() -{ +bool ceph::io_sequence::tester::TestObject::readyForIo() { return exerciser_model->readyForIoOp(*op); } -bool ceph::io_sequence::tester::TestObject::next() -{ +bool ceph::io_sequence::tester::TestObject::next() { if (!done) { if (verbose) { - dout(0) << exerciser_model->get_oid() - << " Step " << seq->get_step() << ": " - << op->to_string(exerciser_model->get_block_size()) << dendl; + dout(0) << exerciser_model->get_oid() << " Step " << seq->get_step() + << ": " << op->to_string(exerciser_model->get_block_size()) + << dendl; } else { - dout(5) << exerciser_model->get_oid() - << " Step " << seq->get_step() << ": " - << op->to_string(exerciser_model->get_block_size()) << dendl; + dout(5) << exerciser_model->get_oid() << " Step " << seq->get_step() + << ": " << 
op->to_string(exerciser_model->get_block_size()) + << dendl; } exerciser_model->applyIoOp(*op); - if (op->done()) { - ++curseq; - if (curseq == seq_range.second) { + if (op->getOpType() == ceph::io_exerciser::OpType::Done) { + curseq = seq->getNextSupportedSequenceId(); + if (curseq >= seq_range.second) { done = true; dout(0) << exerciser_model->get_oid() << " Number of IOs = " << exerciser_model->get_num_io() << dendl; } else { - seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq, - obj_size_range, - seqseed.value_or(rng())); - dout(0) << "== " << exerciser_model->get_oid() << " " - << curseq << " " << seq->get_name() - << " ==" <<dendl; + if (testrecovery) { + seq = ceph::io_exerciser::EcIoSequence::generate_sequence( + curseq, obj_size_range, poolK, poolM, seqseed.value_or(rng())); + } else { + seq = ceph::io_exerciser::IoSequence::generate_sequence( + curseq, obj_size_range, seqseed.value_or(rng())); + } + + dout(0) << "== " << exerciser_model->get_oid() << " " << curseq << " " + << seq->get_name_with_seqseed() << " ==" << dendl; op = seq->next(); } } else { @@ -499,27 +575,30 @@ bool ceph::io_sequence::tester::TestObject::next() return done; } -bool ceph::io_sequence::tester::TestObject::finished() -{ - return done; -} +bool ceph::io_sequence::tester::TestObject::finished() { return done; } -int ceph::io_sequence::tester::TestObject::get_num_io() -{ +int ceph::io_sequence::tester::TestObject::get_num_io() { return exerciser_model->get_num_io(); } ceph::io_sequence::tester::TestRunner::TestRunner(po::variables_map& vm, - librados::Rados& rados) : - rados(rados), - seed(vm.contains("seed") ? vm["seed"].as<int>() : time(nullptr)), - rng(ceph::util::random_number_generator<int>(seed)), - sbs{rng, vm}, - sos{rng, vm}, - spo{rng, vm, rados, vm.contains("dryrun")}, - snt{rng, vm}, - ssr{rng, vm} -{ + librados::Rados& rados) + : rados(rados), + seed(vm.contains("seed") ? 
vm["seed"].as<int>() : time(nullptr)), + rng(ceph::util::random_number_generator<int>(seed)), + sbs{rng, vm}, + sos{rng, vm}, + spo{rng, + vm, + rados, + vm.contains("dryrun"), + vm.contains("allow_pool_autoscaling"), + vm.contains("allow_pool_balancer"), + vm.contains("allow_pool_deep_scrubbing"), + vm.contains("allow_pool_scrubbing"), + vm.contains("test_recovery")}, + snt{rng, vm}, + ssr{rng, vm} { dout(0) << "Test using seed " << seed << dendl; verbose = vm.contains("verbose"); @@ -532,19 +611,23 @@ ceph::io_sequence::tester::TestRunner::TestRunner(po::variables_map& vm, num_objects = vm["parallel"].as<int>(); object_name = vm["object"].as<std::string>(); interactive = vm.contains("interactive"); + testrecovery = vm.contains("testrecovery"); + + allow_pool_autoscaling = vm.contains("allow_pool_autoscaling"); + allow_pool_balancer = vm.contains("allow_pool_balancer"); + allow_pool_deep_scrubbing = vm.contains("allow_pool_deep_scrubbing"); + allow_pool_scrubbing = vm.contains("allow_pool_scrubbing"); - if (!dryrun) - { + if (!dryrun) { guard.emplace(boost::asio::make_work_guard(asio)); - thread = make_named_thread("io_thread",[&asio = asio] { asio.run(); }); + thread = make_named_thread("io_thread", [&asio = asio] { asio.run(); }); } show_help = vm.contains("help"); show_sequence = vm.contains("listsequence"); } -ceph::io_sequence::tester::TestRunner::~TestRunner() -{ +ceph::io_sequence::tester::TestRunner::~TestRunner() { if (!dryrun) { guard = std::nullopt; asio.stop(); @@ -553,34 +636,38 @@ ceph::io_sequence::tester::TestRunner::~TestRunner() } } -void ceph::io_sequence::tester::TestRunner::help() -{ +void ceph::io_sequence::tester::TestRunner::help() { std::cout << get_options_description() << std::endl; for (auto line : usage) { std::cout << line << std::endl; } } -void ceph::io_sequence::tester::TestRunner::list_sequence() -{ +void ceph::io_sequence::tester::TestRunner::list_sequence(bool testrecovery) { // List seqeunces - std::pair<int,int> obj_size_range 
= sos.choose(); - for (ceph::io_exerciser::Sequence s - = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN; - s < ceph::io_exerciser::Sequence::SEQUENCE_END; ++s) { - std::unique_ptr<ceph::io_exerciser::IoSequence> seq = - ceph::io_exerciser::IoSequence::generate_sequence(s, - obj_size_range, - seqseed.value_or(rng())); - dout(0) << s << " " << seq->get_name() << dendl; + std::pair<int, int> obj_size_range = sos.choose(); + ceph::io_exerciser::Sequence s = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN; + std::unique_ptr<ceph::io_exerciser::IoSequence> seq; + if (testrecovery) { + seq = ceph::io_exerciser::EcIoSequence::generate_sequence( + s, obj_size_range, spo.getChosenK(), spo.getChosenM(), + seqseed.value_or(rng())); + } else { + seq = ceph::io_exerciser::IoSequence::generate_sequence( + s, obj_size_range, seqseed.value_or(rng())); } + + do { + dout(0) << s << " " << seq->get_name_with_seqseed() << dendl; + s = seq->getNextSupportedSequenceId(); + } while (s != ceph::io_exerciser::Sequence::SEQUENCE_END); } -std::string ceph::io_sequence::tester::TestRunner::get_token() -{ - static std::string line; - static ceph::split split = ceph::split(""); - static ceph::spliterator tokens; +void ceph::io_sequence::tester::TestRunner::clear_tokens() { + tokens = split.end(); +} + +std::string ceph::io_sequence::tester::TestRunner::get_token() { while (line.empty() || tokens == split.end()) { if (!std::getline(std::cin, line)) { throw std::runtime_error("End of input"); @@ -591,127 +678,211 @@ std::string ceph::io_sequence::tester::TestRunner::get_token() return std::string(*tokens++); } -uint64_t ceph::io_sequence::tester::TestRunner::get_numeric_token() -{ +std::optional<std::string> +ceph::io_sequence::tester::TestRunner ::get_optional_token() { + std::optional<std::string> ret = std::nullopt; + if (tokens != split.end()) { + ret = std::string(*tokens++); + } + return ret; +} + +uint64_t ceph::io_sequence::tester::TestRunner::get_numeric_token() { std::string parse_error; 
std::string token = get_token(); uint64_t num = strict_iecstrtoll(token, &parse_error); if (!parse_error.empty()) { - throw std::runtime_error("Invalid number "+token); + throw std::runtime_error("Invalid number " + token); } return num; } -bool ceph::io_sequence::tester::TestRunner::run_test() -{ - if (show_help) - { +std::optional<uint64_t> +ceph::io_sequence::tester::TestRunner ::get_optional_numeric_token() { + std::string parse_error; + std::optional<std::string> token = get_optional_token(); + if (token) { + uint64_t num = strict_iecstrtoll(*token, &parse_error); + if (!parse_error.empty()) { + throw std::runtime_error("Invalid number " + *token); + } + return num; + } + + return std::optional<uint64_t>(std::nullopt); +} + +bool ceph::io_sequence::tester::TestRunner::run_test() { + if (show_help) { help(); return true; - } - else if (show_sequence) - { - list_sequence(); + } else if (show_sequence) { + list_sequence(testrecovery); return true; - } - else if (interactive) - { + } else if (interactive) { return run_interactive_test(); - } - else - { + } else { return run_automated_test(); } } -bool ceph::io_sequence::tester::TestRunner::run_interactive_test() -{ +bool ceph::io_sequence::tester::TestRunner::run_interactive_test() { bool done = false; std::unique_ptr<ceph::io_exerciser::IoOp> ioop; std::unique_ptr<ceph::io_exerciser::Model> model; if (dryrun) { - model = std::make_unique<ceph::io_exerciser::ObjectModel>(object_name, - sbs.choose(), - rng()); + model = std::make_unique<ceph::io_exerciser::ObjectModel>( + object_name, sbs.choose(), rng()); } else { const std::string pool = spo.choose(); - model = std::make_unique<ceph::io_exerciser::RadosIo>(rados, asio, pool, - object_name, sbs.choose(), - rng(), 1, // 1 thread - lock, cond); + + bufferlist inbl, outbl; + auto formatter = std::make_unique<JSONFormatter>(false); + + ceph::messaging::osd::OSDMapRequest osdMapRequest{pool, object_name, ""}; + int rc = send_mon_command(osdMapRequest, rados, 
"OSDMapRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + ceph::messaging::osd::OSDMapReply reply{}; + reply.decode_json(&p); + + model = std::make_unique<ceph::io_exerciser::RadosIo>( + rados, asio, pool, object_name, reply.acting, sbs.choose(), rng(), + 1, // 1 thread + lock, cond); } while (!done) { const std::string op = get_token(); - if (!op.compare("done") || !op.compare("q") || !op.compare("quit")) { - ioop = ceph::io_exerciser::IoOp::generate_done(); - } else if (!op.compare("create")) { - ioop = ceph::io_exerciser::IoOp::generate_create(get_numeric_token()); - } else if (!op.compare("remove") || !op.compare("delete")) { - ioop = ceph::io_exerciser::IoOp::generate_remove(); - } else if (!op.compare("read")) { + if (op == "done" || op == "q" || op == "quit") { + ioop = ceph::io_exerciser::DoneOp::generate(); + } else if (op == "create") { + ioop = ceph::io_exerciser::CreateOp::generate(get_numeric_token()); + } else if (op == "remove" || op == "delete") { + ioop = ceph::io_exerciser::RemoveOp::generate(); + } else if (op == "read") { uint64_t offset = get_numeric_token(); uint64_t length = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_read(offset, length); - } else if (!op.compare("read2")) { + ioop = ceph::io_exerciser::SingleReadOp::generate(offset, length); + } else if (op == "read2") { uint64_t offset1 = get_numeric_token(); uint64_t length1 = get_numeric_token(); uint64_t offset2 = get_numeric_token(); uint64_t length2 = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_read2(offset1, length1, - offset2, length2); - } else if (!op.compare("read3")) { + ioop = DoubleReadOp::generate(offset1, length1, offset2, length2); + } else if (op == "read3") { uint64_t offset1 = get_numeric_token(); uint64_t length1 = get_numeric_token(); uint64_t offset2 = get_numeric_token(); uint64_t length2 = 
get_numeric_token(); uint64_t offset3 = get_numeric_token(); uint64_t length3 = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_read3(offset1, length1, - offset2, length2, - offset3, length3); - } else if (!op.compare("write")) { + ioop = TripleReadOp::generate(offset1, length1, offset2, length2, offset3, + length3); + } else if (op == "write") { uint64_t offset = get_numeric_token(); uint64_t length = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_write(offset, length); - } else if (!op.compare("write2")) { + ioop = SingleWriteOp::generate(offset, length); + } else if (op == "write2") { uint64_t offset1 = get_numeric_token(); uint64_t length1 = get_numeric_token(); uint64_t offset2 = get_numeric_token(); uint64_t length2 = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_write2(offset1, length1, - offset2, length2); - } else if (!op.compare("write3")) { + ioop = DoubleWriteOp::generate(offset1, length1, offset2, length2); + } else if (op == "write3") { uint64_t offset1 = get_numeric_token(); uint64_t length1 = get_numeric_token(); uint64_t offset2 = get_numeric_token(); uint64_t length2 = get_numeric_token(); uint64_t offset3 = get_numeric_token(); uint64_t length3 = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_write3(offset1, length1, - offset2, length2, - offset3, length3); + ioop = TripleWriteOp::generate(offset1, length1, offset2, length2, + offset3, length3); + } else if (op == "failedwrite") { + uint64_t offset = get_numeric_token(); + uint64_t length = get_numeric_token(); + ioop = SingleFailedWriteOp::generate(offset, length); + } else if (op == "failedwrite2") { + uint64_t offset1 = get_numeric_token(); + uint64_t length1 = get_numeric_token(); + uint64_t offset2 = get_numeric_token(); + uint64_t length2 = get_numeric_token(); + ioop = DoubleFailedWriteOp::generate(offset1, length1, offset2, length2); + } else if (op == "failedwrite3") { + uint64_t offset1 = get_numeric_token(); + 
uint64_t length1 = get_numeric_token(); + uint64_t offset2 = get_numeric_token(); + uint64_t length2 = get_numeric_token(); + uint64_t offset3 = get_numeric_token(); + uint64_t length3 = get_numeric_token(); + ioop = TripleFailedWriteOp::generate(offset1, length1, offset2, length2, + offset3, length3); + } else if (op == "injecterror") { + std::string inject_type = get_token(); + int shard = get_numeric_token(); + std::optional<int> type = get_optional_numeric_token(); + std::optional<int> when = get_optional_numeric_token(); + std::optional<int> duration = get_optional_numeric_token(); + if (inject_type == "read") { + ioop = ceph::io_exerciser::InjectReadErrorOp::generate(shard, type, + when, duration); + } else if (inject_type == "write") { + ioop = ceph::io_exerciser::InjectWriteErrorOp::generate(shard, type, + when, duration); + } else { + clear_tokens(); + ioop.reset(); + dout(0) << fmt::format("Invalid error inject {}. No action performed.", + inject_type) + << dendl; + } + } else if (op == "clearinject") { + std::string inject_type = get_token(); + int shard = get_numeric_token(); + std::optional<int> type = get_optional_numeric_token(); + if (inject_type == "read") { + ioop = + ceph::io_exerciser::ClearReadErrorInjectOp::generate(shard, type); + } else if (inject_type == "write") { + ioop = + ceph::io_exerciser::ClearWriteErrorInjectOp::generate(shard, type); + } else { + clear_tokens(); + ioop.reset(); + dout(0) << fmt::format("Invalid error inject {}. No action performed.", + inject_type) + << dendl; + } } else { - throw std::runtime_error("Invalid operation "+op); + clear_tokens(); + ioop.reset(); + dout(0) << fmt::format("Invalid op {}. 
No action performed.", op) + << dendl; } - dout(0) << ioop->to_string(model->get_block_size()) << dendl; - model->applyIoOp(*ioop); - done = ioop->done(); - if (!done) { - ioop = ceph::io_exerciser::IoOp::generate_barrier(); + if (ioop) { + dout(0) << ioop->to_string(model->get_block_size()) << dendl; model->applyIoOp(*ioop); + done = ioop->getOpType() == ceph::io_exerciser::OpType::Done; + if (!done) { + ioop = ceph::io_exerciser::BarrierOp::generate(); + model->applyIoOp(*ioop); + } } } return true; } -bool ceph::io_sequence::tester::TestRunner::run_automated_test() -{ +bool ceph::io_sequence::tester::TestRunner::run_automated_test() { // Create a test for each object - std::vector<std::shared_ptr< - ceph::io_sequence::tester::TestObject>> test_objects; + std::vector<std::shared_ptr<ceph::io_sequence::tester::TestObject>> + test_objects; for (int obj = 0; obj < num_objects; obj++) { std::string name; @@ -721,15 +892,9 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test() name = object_name + std::to_string(obj); } test_objects.push_back( - std::make_shared<ceph::io_sequence::tester::TestObject>( - name, - rados, asio, - sbs, spo, sos, snt, ssr, - rng, lock, cond, - dryrun, verbose, - seqseed - ) - ); + std::make_shared<ceph::io_sequence::tester::TestObject>( + name, rados, asio, sbs, spo, sos, snt, ssr, rng, lock, cond, dryrun, + verbose, seqseed, testrecovery)); } if (!dryrun) { rados.wait_for_latest_osdmap(); @@ -748,16 +913,15 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test() for (auto obj = test_objects.begin(); obj != test_objects.end(); ++obj) { std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj; if (!to->finished()) { - lock.lock(); - bool ready = to->readyForIo(); - lock.unlock(); - if (ready) - { - to->next(); - started_io = true; - } else { - need_wait = true; - } + lock.lock(); + bool ready = to->readyForIo(); + lock.unlock(); + if (ready) { + to->next(); + started_io = true; + } else { + need_wait = true; 
+ } } } if (!started_io && need_wait) { @@ -767,8 +931,7 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test() std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj; if (!to->finished()) { need_wait = !to->readyForIo(); - if (!need_wait) - { + if (!need_wait) { break; } } @@ -788,18 +951,16 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test() return true; } -int main(int argc, char **argv) -{ +int main(int argc, char** argv) { auto args = argv_to_vec(argc, argv); env_to_vec(args); auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, - CODE_ENVIRONMENT_UTILITY, 0); + CODE_ENVIRONMENT_UTILITY, 0); common_init_finish(cct.get()); po::variables_map vm; int rc = parse_io_seq_options(vm, argc, argv); - if (rc != 0) - { + if (rc != 0) { return rc; } @@ -814,7 +975,7 @@ int main(int argc, char **argv) std::unique_ptr<ceph::io_sequence::tester::TestRunner> runner; try { runner = std::make_unique<ceph::io_sequence::tester::TestRunner>(vm, rados); - } catch(const po::error& e) { + } catch (const po::error& e) { return 1; } runner->run_test(); diff --git a/src/test/osd/ceph_test_rados_io_sequence.h b/src/test/osd/ceph_test_rados_io_sequence.h index 4e21d025700..9af5f706b2f 100644 --- a/src/test/osd/ceph_test_rados_io_sequence.h +++ b/src/test/osd/ceph_test_rados_io_sequence.h @@ -1,34 +1,36 @@ +#include <boost/program_options.hpp> +#include <optional> #include <utility> -#include "include/random.h" - -#include "global/global_init.h" -#include "global/global_context.h" - #include "common/io_exerciser/IoOp.h" #include "common/io_exerciser/IoSequence.h" #include "common/io_exerciser/Model.h" - +#include "common/split.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "include/random.h" #include "librados/librados_asio.h" #include <boost/asio/io_context.hpp> #include <boost/program_options.hpp> +#include <optional> + /* Overview * * class ProgramOptionSelector - * Base class for selector objects below 
with common code for + * Base class for selector objects below with common code for * selecting options - * + * * class SelectObjectSize * Selects min and max object sizes for a test * * class SelectErasureKM * Selects an EC k and m value for a test - * + * * class SelectErasurePlugin * Selects an plugin for a test - * + * * class SelectECPool * Selects an EC pool (plugin,k and m) for a test. Also creates the * pool as well. @@ -58,287 +60,279 @@ namespace po = boost::program_options; -namespace ceph -{ - namespace io_sequence::tester - { - // Choices for min and max object size - inline constexpr size_t objectSizeSize = 10; - inline constexpr std::array<std::pair<int,int>,objectSizeSize> - objectSizeChoices = {{ - {1,32}, // Default - best for boundary checking - {12,14}, - {28,30}, - {36,38}, - {42,44}, - {52,54}, - {66,68}, - {72,74}, - {83,83}, - {97,97} - }}; - - // Choices for block size - inline constexpr int blockSizeSize = 5; - inline constexpr std::array<uint64_t, blockSizeSize> blockSizeChoices = {{ - 2048, // Default - test boundaries for EC 4K chunk size - 512, - 3767, - 4096, - 32768 - }}; - - // Choices for number of threads - inline constexpr int threadArraySize = 4; - inline constexpr std::array<int, threadArraySize> threadCountChoices = {{ - 1, // Default - 2, - 4, - 8 - }}; - - // Choices for EC k+m profile - inline constexpr int kmSize = 6; - inline constexpr std::array<std::pair<int,int>, kmSize> kmChoices = {{ - {2,2}, // Default - reasonable coverage - {2,1}, - {2,3}, - {3,2}, - {4,2}, - {5,1} - }}; - - // Choices for EC chunk size - inline constexpr int chunkSizeSize = 3; - inline constexpr std::array<uint64_t, chunkSizeSize> chunkSizeChoices = {{ - 4*1024, - 64*1024, - 256*1024 - }}; - - // Choices for plugin - inline constexpr int pluginListSize = 2; - inline constexpr std::array<std::string_view, - pluginListSize> pluginChoices = {{ - "jerasure", - "isa" - }}; - - inline constexpr std::array<std::pair<ceph::io_exerciser::Sequence, - 
ceph::io_exerciser::Sequence>, - 0> sequencePairs = {{}}; - - inline constexpr std::array<std::string, 0> poolChoices = {{}}; - - template <typename T, int N, const std::array<T, N>& Ts> - class ProgramOptionSelector - { - public: - ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, - po::variables_map vm, - const std::string& option_name, - bool set_forced, - bool select_first - ); - virtual ~ProgramOptionSelector() = default; - bool isForced(); - virtual const T choose(); - - protected: - ceph::util::random_number_generator<int>& rng; - static constexpr std::array<T, N> choices = Ts; - - std::optional<T> force_value; - std::optional<T> first_value; - - std::string option_name; - }; - - class SelectObjectSize - : public ProgramOptionSelector<std::pair<int, int>, - io_sequence::tester::objectSizeSize, - io_sequence::tester::objectSizeChoices> - { - public: - SelectObjectSize(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - }; - - class SelectBlockSize - : public ProgramOptionSelector<uint64_t, - io_sequence::tester::blockSizeSize, - io_sequence::tester::blockSizeChoices> - { - public: - SelectBlockSize(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - }; - - class SelectNumThreads - : public ProgramOptionSelector<int, - io_sequence::tester::threadArraySize, - io_sequence::tester::threadCountChoices> - { - public: - SelectNumThreads(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - }; - - class SelectSeqRange - : public ProgramOptionSelector<std::pair<ceph::io_exerciser::Sequence, - ceph::io_exerciser::Sequence>, - 0, io_sequence::tester::sequencePairs> - { - public: - SelectSeqRange(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - - const std::pair<ceph::io_exerciser::Sequence, - ceph::io_exerciser::Sequence> choose() override; - }; - - class SelectErasureKM - : public ProgramOptionSelector<std::pair<int,int>, - io_sequence::tester::kmSize, - 
io_sequence::tester::kmChoices> - { - public: - SelectErasureKM(ceph::util::random_number_generator<int>& rng, +namespace ceph { +namespace io_sequence::tester { +// Choices for min and max object size +inline constexpr size_t objectSizeSize = 10; +inline constexpr std::array<std::pair<int, int>, objectSizeSize> + objectSizeChoices = {{{1, 32}, // Default - best for boundary checking + {12, 14}, + {28, 30}, + {36, 38}, + {42, 44}, + {52, 54}, + {66, 68}, + {72, 74}, + {83, 83}, + {97, 97}}}; + +// Choices for block size +inline constexpr int blockSizeSize = 5; +inline constexpr std::array<uint64_t, blockSizeSize> blockSizeChoices = { + {2048, // Default - test boundaries for EC 4K chunk size + 512, 3767, 4096, 32768}}; + +// Choices for number of threads +inline constexpr int threadArraySize = 4; +inline constexpr std::array<int, threadArraySize> threadCountChoices = { + {1, // Default + 2, 4, 8}}; + +// Choices for EC k+m profile +inline constexpr int kmSize = 6; +inline constexpr std::array<std::pair<int, int>, kmSize> kmChoices = { + {{2, 2}, // Default - reasonable coverage + {2, 1}, + {2, 3}, + {3, 2}, + {4, 2}, + {5, 1}}}; + +// Choices for EC chunk size +inline constexpr int chunkSizeSize = 3; +inline constexpr std::array<uint64_t, chunkSizeSize> chunkSizeChoices = { + {4 * 1024, 64 * 1024, 256 * 1024}}; + +// Choices for plugin +inline constexpr int pluginListSize = 2; +inline constexpr std::array<std::string_view, pluginListSize> pluginChoices = { + {"jerasure", "isa"}}; + +inline constexpr std::array< + std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>, 0> + sequencePairs = {{}}; + +inline constexpr std::array<std::string, 0> poolChoices = {{}}; + +template <typename T, int N, const std::array<T, N>& Ts> +class ProgramOptionSelector { + public: + ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, + po::variables_map vm, const std::string& option_name, + bool set_forced, bool select_first); + virtual 
~ProgramOptionSelector() = default; + bool isForced(); + virtual const T choose(); + + protected: + ceph::util::random_number_generator<int>& rng; + static constexpr std::array<T, N> choices = Ts; + + std::optional<T> force_value; + std::optional<T> first_value; + + std::string option_name; +}; + +class SelectObjectSize + : public ProgramOptionSelector<std::pair<int, int>, + io_sequence::tester::objectSizeSize, + io_sequence::tester::objectSizeChoices> { + public: + SelectObjectSize(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectBlockSize + : public ProgramOptionSelector<uint64_t, io_sequence::tester::blockSizeSize, + io_sequence::tester::blockSizeChoices> { + public: + SelectBlockSize(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectNumThreads + : public ProgramOptionSelector<int, io_sequence::tester::threadArraySize, + io_sequence::tester::threadCountChoices> { + public: + SelectNumThreads(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectSeqRange + : public ProgramOptionSelector< + std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>, + 0, io_sequence::tester::sequencePairs> { + public: + SelectSeqRange(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); + + const std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence> + choose() override; +}; + +class SelectErasureKM + : public ProgramOptionSelector<std::pair<int, int>, + io_sequence::tester::kmSize, + io_sequence::tester::kmChoices> { + public: + SelectErasureKM(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectErasurePlugin + : public ProgramOptionSelector<std::string_view, + io_sequence::tester::pluginListSize, + io_sequence::tester::pluginChoices> { + public: + SelectErasurePlugin(ceph::util::random_number_generator<int>& rng, po::variables_map vm); - }; - - class SelectErasurePlugin - : 
public ProgramOptionSelector<std::string_view, - io_sequence::tester::pluginListSize, - io_sequence::tester::pluginChoices> - { - public: - SelectErasurePlugin(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - }; - - class SelectErasureChunkSize - : public ProgramOptionSelector<uint64_t, - io_sequence::tester::chunkSizeSize, - io_sequence::tester::chunkSizeChoices> - { - public: - SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, po::variables_map vm); - }; - - class SelectECPool - : public ProgramOptionSelector<std::string, - 0, - io_sequence::tester::poolChoices> - { - public: - SelectECPool(ceph::util::random_number_generator<int>& rng, - po::variables_map vm, - librados::Rados& rados, - bool dry_run); - const std::string choose() override; - - private: - void create_pool(librados::Rados& rados, - const std::string& pool_name, - const std::string& plugin, - uint64_t chunk_size, - int k, int m); - - protected: - librados::Rados& rados; - bool dry_run; - - SelectErasureKM skm; - SelectErasurePlugin spl; - SelectErasureChunkSize scs; - }; - - class TestObject - { - public: - TestObject( const std::string oid, - librados::Rados& rados, - boost::asio::io_context& asio, - ceph::io_sequence::tester::SelectBlockSize& sbs, - ceph::io_sequence::tester::SelectECPool& spl, - ceph::io_sequence::tester::SelectObjectSize& sos, - ceph::io_sequence::tester::SelectNumThreads& snt, - ceph::io_sequence::tester::SelectSeqRange& ssr, - ceph::util::random_number_generator<int>& rng, - ceph::mutex& lock, - ceph::condition_variable& cond, - bool dryrun, - bool verbose, - std::optional<int> seqseed); - - int get_num_io(); - bool readyForIo(); - bool next(); - bool finished(); - - protected: - std::unique_ptr<ceph::io_exerciser::Model> exerciser_model; - std::pair<int,int> obj_size_range; - std::pair<ceph::io_exerciser::Sequence, - ceph::io_exerciser::Sequence> seq_range; - ceph::io_exerciser::Sequence curseq; - 
std::unique_ptr<ceph::io_exerciser::IoSequence> seq; - std::unique_ptr<ceph::io_exerciser::IoOp> op; - bool done; - ceph::util::random_number_generator<int>& rng; - bool verbose; - std::optional<int> seqseed; - }; - - class TestRunner - { - public: - TestRunner(po::variables_map& vm, librados::Rados& rados); - ~TestRunner(); - - bool run_test(); - - private: - librados::Rados& rados; - int seed; - ceph::util::random_number_generator<int> rng; - - ceph::io_sequence::tester::SelectBlockSize sbs; - ceph::io_sequence::tester::SelectObjectSize sos; - ceph::io_sequence::tester::SelectECPool spo; - ceph::io_sequence::tester::SelectNumThreads snt; - ceph::io_sequence::tester::SelectSeqRange ssr; - - boost::asio::io_context asio; - std::thread thread; - std::optional<boost::asio::executor_work_guard< - boost::asio::io_context::executor_type>> guard; - ceph::mutex lock = ceph::make_mutex("RadosIo::lock"); - ceph::condition_variable cond; - - bool input_valid; - - bool verbose; - bool dryrun; - std::optional<int> seqseed; - bool interactive; - - bool show_sequence; - bool show_help; - - int num_objects; - std::string object_name; - - std::string get_token(); - uint64_t get_numeric_token(); - - bool run_automated_test(); - - bool run_interactive_test(); - - void help(); - void list_sequence(); - }; - } -} +}; + +class SelectErasureChunkSize + : public ProgramOptionSelector<uint64_t, io_sequence::tester::chunkSizeSize, + io_sequence::tester::chunkSizeChoices> { + public: + SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectECPool + : public ProgramOptionSelector<std::string, 0, + io_sequence::tester::poolChoices> { + public: + SelectECPool(ceph::util::random_number_generator<int>& rng, + po::variables_map vm, librados::Rados& rados, bool dry_run, + bool allow_pool_autoscaling, bool allow_pool_balancer, + bool allow_pool_deep_scrubbing, bool allow_pool_scrubbing, + bool test_recovery); + const std::string choose() 
override; + + bool get_allow_pool_autoscaling() { return allow_pool_autoscaling; } + bool get_allow_pool_balancer() { return allow_pool_balancer; } + bool get_allow_pool_deep_scrubbing() { return allow_pool_deep_scrubbing; } + bool get_allow_pool_scrubbing() { return allow_pool_scrubbing; } + int getChosenK() const { return k; } + int getChosenM() const { return m; } + + private: + void create_pool(librados::Rados& rados, const std::string& pool_name, + const std::string& plugin, uint64_t chunk_size, int k, + int m); + + protected: + librados::Rados& rados; + bool dry_run; + bool allow_pool_autoscaling; + bool allow_pool_balancer; + bool allow_pool_deep_scrubbing; + bool allow_pool_scrubbing; + bool test_recovery; + int k; + int m; + + SelectErasureKM skm; + SelectErasurePlugin spl; + SelectErasureChunkSize scs; +}; + +class TestObject { + public: + TestObject(const std::string oid, librados::Rados& rados, + boost::asio::io_context& asio, + ceph::io_sequence::tester::SelectBlockSize& sbs, + ceph::io_sequence::tester::SelectECPool& spl, + ceph::io_sequence::tester::SelectObjectSize& sos, + ceph::io_sequence::tester::SelectNumThreads& snt, + ceph::io_sequence::tester::SelectSeqRange& ssr, + ceph::util::random_number_generator<int>& rng, ceph::mutex& lock, + ceph::condition_variable& cond, bool dryrun, bool verbose, + std::optional<int> seqseed, bool testRecovery); + + int get_num_io(); + bool readyForIo(); + bool next(); + bool finished(); + + protected: + std::unique_ptr<ceph::io_exerciser::Model> exerciser_model; + std::pair<int, int> obj_size_range; + std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence> + seq_range; + ceph::io_exerciser::Sequence curseq; + std::unique_ptr<ceph::io_exerciser::IoSequence> seq; + std::unique_ptr<ceph::io_exerciser::IoOp> op; + bool done; + ceph::util::random_number_generator<int>& rng; + bool verbose; + std::optional<int> seqseed; + int poolK; + int poolM; + bool testrecovery; +}; + +class TestRunner { + public: + 
TestRunner(po::variables_map& vm, librados::Rados& rados); + ~TestRunner(); + + bool run_test(); + + private: + librados::Rados& rados; + int seed; + ceph::util::random_number_generator<int> rng; + + ceph::io_sequence::tester::SelectBlockSize sbs; + ceph::io_sequence::tester::SelectObjectSize sos; + ceph::io_sequence::tester::SelectECPool spo; + ceph::io_sequence::tester::SelectNumThreads snt; + ceph::io_sequence::tester::SelectSeqRange ssr; + + boost::asio::io_context asio; + std::thread thread; + std::optional< + boost::asio::executor_work_guard<boost::asio::io_context::executor_type>> + guard; + ceph::mutex lock = ceph::make_mutex("RadosIo::lock"); + ceph::condition_variable cond; + + bool input_valid; + + bool verbose; + bool dryrun; + std::optional<int> seqseed; + bool interactive; + + bool testrecovery; + + bool allow_pool_autoscaling; + bool allow_pool_balancer; + bool allow_pool_deep_scrubbing; + bool allow_pool_scrubbing; + + bool show_sequence; + bool show_help; + + int num_objects; + std::string object_name; + + std::string line; + ceph::split split = ceph::split(""); + ceph::spliterator tokens; + + void clear_tokens(); + std::string get_token(); + std::optional<std::string> get_optional_token(); + uint64_t get_numeric_token(); + std::optional<uint64_t> get_optional_numeric_token(); + + bool run_automated_test(); + + bool run_interactive_test(); + + void help(); + void list_sequence(bool testrecovery); +}; +} // namespace io_sequence::tester +} // namespace ceph diff --git a/src/test/pybind/pytest.ini b/src/test/pybind/pytest.ini index dccf2a346dc..97569e88299 100644 --- a/src/test/pybind/pytest.ini +++ b/src/test/pybind/pytest.ini @@ -7,3 +7,4 @@ markers = stats tier watch + wait diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py index 3039223abdf..630e6046b24 100755 --- a/src/test/pybind/test_ceph_argparse.py +++ b/src/test/pybind/test_ceph_argparse.py @@ -217,7 +217,7 @@ class TestPG(TestArgparse): def 
test_pg_missing_args_output(self): ret, _, stderr = self._capture_output(['pg'], stderr=True) self.assertEqual({}, ret) - self.assertRegexpMatches(stderr, re.compile('no valid command found.* closest matches')) + self.assertRegex(stderr, re.compile('no valid command found.* closest matches')) def test_pg_wrong_arg_output(self): ret, _, stderr = self._capture_output(['pg', 'map', 'bad-pgid'], @@ -416,10 +416,10 @@ class TestMDS(TestArgparse): class TestFS(TestArgparse): - + def test_dump(self): self.check_0_or_1_natural_arg('fs', 'dump') - + def test_fs_new(self): self._assert_valid_command(['fs', 'new', 'default', 'metadata', 'data']) @@ -912,7 +912,7 @@ class TestOSD(TestArgparse): '1.2.3.4/567', '600.40']) self._assert_valid_command(['osd', 'blocklist', action, '1.2.3.4', '600.40']) - + self._assert_valid_command(['osd', 'blocklist', action, 'v1:1.2.3.4', '600.40']) self._assert_valid_command(['osd', 'blocklist', action, @@ -925,7 +925,7 @@ class TestOSD(TestArgparse): 'v2:[2607:f298:4:2243::5522]:0/0', '600.40']) self._assert_valid_command(['osd', 'blocklist', action, '[2001:0db8::85a3:0000:8a2e:0370:7334]:0/0', '600.40']) - + self.assertEqual({}, validate_command(sigdict, ['osd', 'blocklist', action, 'invalid', diff --git a/src/test/pybind/test_rados.py b/src/test/pybind/test_rados.py index cb2a4f96101..881b29c9152 100644 --- a/src/test/pybind/test_rados.py +++ b/src/test/pybind/test_rados.py @@ -207,7 +207,7 @@ class TestRados(object): def test_get_fsid(self): fsid = self.rados.get_fsid() - assert re.match('[0-9a-f\-]{36}', fsid, re.I) + assert re.match(r'[0-9a-f\-]{36}', fsid, re.I) def test_blocklist_add(self): self.rados.blocklist_add("1.2.3.4/123", 1) @@ -516,6 +516,11 @@ class TestIoctx(object): eq(self.ioctx.read('write_ops'), b'12\x00\x005') write_op.write_full(b'12345') + write_op.zero(0, 2) + self.ioctx.operate_write_op(write_op, "write_ops") + eq(self.ioctx.read('write_ops'), b'\x00\x00345') + + write_op.write_full(b'12345') write_op.truncate(2) 
self.ioctx.operate_write_op(write_op, "write_ops") eq(self.ioctx.read('write_ops'), b'12') diff --git a/src/test/rgw/bucket_notification/requirements.txt b/src/test/rgw/bucket_notification/requirements.txt index a3cff2bedab..bb74eceedc3 100644 --- a/src/test/rgw/bucket_notification/requirements.txt +++ b/src/test/rgw/bucket_notification/requirements.txt @@ -1,4 +1,4 @@ -nose >=1.0.0 +nose-py3 >=1.0.0 boto >=2.6.0 boto3 >=1.0.0 configparser >=5.0.0 diff --git a/src/test/rgw/bucket_notification/test_bn.py b/src/test/rgw/bucket_notification/test_bn.py index 90ee33617fe..665fbca7494 100644 --- a/src/test/rgw/bucket_notification/test_bn.py +++ b/src/test/rgw/bucket_notification/test_bn.py @@ -410,17 +410,25 @@ kafka_server = 'localhost' class KafkaReceiver(object): """class for receiving and storing messages on a topic from the kafka broker""" - def __init__(self, topic, security_type): + def __init__(self, topic, security_type, kafka_server='localhost'): from kafka import KafkaConsumer remaining_retries = 10 port = 9092 if security_type != 'PLAINTEXT': security_type = 'SSL' port = 9093 + + if kafka_server is None: + endpoint = "localhost" + ":" + str(port) + elif ":" not in kafka_server: + endpoint = kafka_server + ":" + str(port) + else: + endpoint = kafka_server + while remaining_retries > 0: try: self.consumer = KafkaConsumer(topic, - bootstrap_servers = kafka_server+':'+str(port), + bootstrap_servers=endpoint, security_protocol=security_type, consumer_timeout_ms=16000, auto_offset_reset='earliest') @@ -468,9 +476,9 @@ def kafka_receiver_thread_runner(receiver): print('Kafka receiver ended unexpectedly: ' + str(error)) -def create_kafka_receiver_thread(topic, security_type='PLAINTEXT'): +def create_kafka_receiver_thread(topic, security_type='PLAINTEXT', kafka_brokers=None): """create kafka receiver and thread""" - receiver = KafkaReceiver(topic, security_type) + receiver = KafkaReceiver(topic, security_type, kafka_server=kafka_brokers) task = 
threading.Thread(target=kafka_receiver_thread_runner, args=(receiver,)) task.daemon = True return task, receiver @@ -1304,7 +1312,7 @@ def test_ps_s3_notification_errors_on_master(): conn.delete_bucket(bucket_name) -def notification_push(endpoint_type, conn, account=None, cloudevents=False): +def notification_push(endpoint_type, conn, account=None, cloudevents=False, kafka_brokers=None): """ test pushinging notification """ zonegroup = get_config_zonegroup() # create bucket @@ -1359,11 +1367,13 @@ def notification_push(endpoint_type, conn, account=None, cloudevents=False): assert_equal(status/100, 2) elif endpoint_type == 'kafka': # start amqp receiver - task, receiver = create_kafka_receiver_thread(topic_name) + task, receiver = create_kafka_receiver_thread(topic_name, kafka_brokers=kafka_brokers) task.start() endpoint_address = 'kafka://' + kafka_server # without acks from broker endpoint_args = 'push-endpoint='+endpoint_address+'&kafka-ack-level=broker' + if kafka_brokers is not None: + endpoint_args += '&kafka-brokers=' + kafka_brokers # create s3 topic topic_conf = PSTopicS3(conn, topic_name, zonegroup, endpoint_args=endpoint_args) topic_arn = topic_conf.set_config() @@ -1581,6 +1591,20 @@ def test_notification_push_kafka(): notification_push('kafka', conn) +@attr('kafka_failover') +def test_notification_push_kafka_multiple_brokers_override(): + """ test pushing kafka s3 notification on master """ + conn = connection() + notification_push('kafka', conn, kafka_brokers='localhost:9092,localhost:19092') + + +@attr('kafka_failover') +def test_notification_push_kafka_multiple_brokers_append(): + """ test pushing kafka s3 notification on master """ + conn = connection() + notification_push('kafka', conn, kafka_brokers='localhost:19092') + + @attr('http_test') def test_ps_s3_notification_multi_delete_on_master(): """ test deletion of multiple keys on master """ @@ -2981,7 +3005,6 @@ def wait_for_queue_to_drain(topic_name, tenant=None, account=None, http_port=Non 
log.info('waited for %ds for queue %s to drain', time_diff, topic_name) -@attr('kafka_test') def persistent_topic_stats(conn, endpoint_type): zonegroup = get_config_zonegroup() @@ -2993,12 +3016,13 @@ def persistent_topic_stats(conn, endpoint_type): host = get_ip() task = None port = None + wrong_port = 1234 + endpoint_address = endpoint_type+'://'+host+':'+str(wrong_port) if endpoint_type == 'http': # create random port for the http server port = random.randint(10000, 20000) # start an http server in a separate thread receiver = HTTPServerWithEvents((host, port)) - endpoint_address = 'http://'+host+':'+str(port) endpoint_args = 'push-endpoint='+endpoint_address+'&persistent=true'+ \ '&retry_sleep_duration=1' elif endpoint_type == 'amqp': @@ -3006,23 +3030,18 @@ def persistent_topic_stats(conn, endpoint_type): exchange = 'ex1' task, receiver = create_amqp_receiver_thread(exchange, topic_name) task.start() - endpoint_address = 'amqp://' + host endpoint_args = 'push-endpoint='+endpoint_address+'&amqp-exchange='+exchange+'&amqp-ack-level=broker&persistent=true'+ \ '&retry_sleep_duration=1' elif endpoint_type == 'kafka': # start kafka receiver task, receiver = create_kafka_receiver_thread(topic_name) task.start() - endpoint_address = 'kafka://' + host endpoint_args = 'push-endpoint='+endpoint_address+'&kafka-ack-level=broker&persistent=true'+ \ '&retry_sleep_duration=1' else: return SkipTest('Unknown endpoint type: ' + endpoint_type) # create s3 topic - endpoint_address = 'kafka://' + host + ':1234' # wrong port - endpoint_args = 'push-endpoint='+endpoint_address+'&kafka-ack-level=broker&persistent=true'+ \ - '&retry_sleep_duration=1' topic_conf = PSTopicS3(conn, topic_name, zonegroup, endpoint_args=endpoint_args) topic_arn = topic_conf.set_config() # create s3 notification @@ -3070,9 +3089,19 @@ def persistent_topic_stats(conn, endpoint_type): get_stats_persistent_topic(topic_name, 2 * number_of_objects) # change the endpoint port - endpoint_address = 'kafka://' + 
host - endpoint_args = 'push-endpoint='+endpoint_address+'&kafka-ack-level=broker&persistent=true'+ \ - '&retry_sleep_duration=1' + if endpoint_type == 'http': + endpoint_address = endpoint_type+'://'+host+':'+str(port) + endpoint_args = 'push-endpoint='+endpoint_address+'&persistent=true'+ \ + '&retry_sleep_duration=1' + elif endpoint_type == 'amqp': + endpoint_address = endpoint_type+'://'+host + endpoint_args = 'push-endpoint='+endpoint_address+'&amqp-exchange='+exchange+'&amqp-ack-level=broker&persistent=true'+ \ + '&retry_sleep_duration=1' + elif endpoint_type == 'kafka': + endpoint_address = endpoint_type+'://'+host + endpoint_args = 'push-endpoint='+endpoint_address+'&kafka-ack-level=broker&persistent=true'+ \ + '&retry_sleep_duration=1' + topic_conf = PSTopicS3(conn, topic_name, zonegroup, endpoint_args=endpoint_args) topic_arn = topic_conf.set_config() @@ -3087,19 +3116,26 @@ def persistent_topic_stats(conn, endpoint_type): @attr('http_test') -def persistent_topic_stats_http(): +def test_persistent_topic_stats_http(): """ test persistent topic stats, http endpoint """ conn = connection() persistent_topic_stats(conn, 'http') @attr('kafka_test') -def persistent_topic_stats_kafka(): +def test_persistent_topic_stats_kafka(): """ test persistent topic stats, kafka endpoint """ conn = connection() persistent_topic_stats(conn, 'kafka') +@attr('amqp_test') +def test_persistent_topic_stats_amqp(): + """ test persistent topic stats, amqp endpoint """ + conn = connection() + persistent_topic_stats(conn, 'amqp') + + @attr('kafka_test') def test_persistent_topic_dump(): """ test persistent topic dump """ diff --git a/src/test/rgw/rgw_multi/tests.py b/src/test/rgw/rgw_multi/tests.py index d95feb5aa95..433cd034fe0 100644 --- a/src/test/rgw/rgw_multi/tests.py +++ b/src/test/rgw/rgw_multi/tests.py @@ -15,6 +15,7 @@ import boto import boto.s3.connection from boto.s3.website import WebsiteConfiguration from boto.s3.cors import CORSConfiguration +from botocore.exceptions 
import ClientError from nose.tools import eq_ as eq from nose.tools import assert_not_equal, assert_equal, assert_true, assert_false @@ -3638,4 +3639,23 @@ def test_copy_object_different_bucket(): CopySource = source_bucket.name + '/' + objname) zonegroup_bucket_checkpoint(zonegroup_conns, dest_bucket.name) - + +def test_bucket_create_location_constraint(): + for zonegroup in realm.current_period.zonegroups: + zonegroup_conns = ZonegroupConns(zonegroup) + for zg in realm.current_period.zonegroups: + z = zonegroup_conns.rw_zones[0] + bucket_name = gen_bucket_name() + if zg.name == zonegroup.name: + # my zonegroup should pass + z.s3_client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': zg.name}) + # check bucket location + response = z.s3_client.get_bucket_location(Bucket=bucket_name) + assert_equal(response['LocationConstraint'], zg.name) + else: + # other zonegroup should fail with 400 + e = assert_raises(ClientError, + z.s3_client.create_bucket, + Bucket=bucket_name, + CreateBucketConfiguration={'LocationConstraint': zg.name}) + assert e.response['ResponseMetadata']['HTTPStatusCode'] == 400 diff --git a/src/test/rgw/test_rgw_iam_policy.cc b/src/test/rgw/test_rgw_iam_policy.cc index 7dadb7812ff..1d13c2aa013 100644 --- a/src/test/rgw/test_rgw_iam_policy.cc +++ b/src/test/rgw/test_rgw_iam_policy.cc @@ -75,6 +75,8 @@ using rgw::IAM::s3GetObjectTagging; using rgw::IAM::s3GetObjectVersion; using rgw::IAM::s3GetObjectVersionTagging; using rgw::IAM::s3GetObjectVersionTorrent; +using rgw::IAM::s3GetObjectAttributes; +using rgw::IAM::s3GetObjectVersionAttributes; using rgw::IAM::s3GetPublicAccessBlock; using rgw::IAM::s3GetReplicationConfiguration; using rgw::IAM::s3ListAllMyBuckets; @@ -419,6 +421,8 @@ TEST_F(PolicyTest, Parse3) { act2[s3GetObjectVersionAcl] = 1; act2[s3GetObjectTorrent] = 1; act2[s3GetObjectVersionTorrent] = 1; + act2[s3GetObjectAttributes] = 1; + act2[s3GetObjectVersionAttributes] = 1; 
act2[s3GetAccelerateConfiguration] = 1; act2[s3GetBucketAcl] = 1; act2[s3GetBucketOwnershipControls] = 1; @@ -487,6 +491,8 @@ TEST_F(PolicyTest, Eval3) { s3allow[s3GetObjectVersion] = 1; s3allow[s3GetObjectAcl] = 1; s3allow[s3GetObjectVersionAcl] = 1; + s3allow[s3GetObjectAttributes] = 1; + s3allow[s3GetObjectVersionAttributes] = 1; s3allow[s3GetObjectTorrent] = 1; s3allow[s3GetObjectVersionTorrent] = 1; s3allow[s3GetAccelerateConfiguration] = 1; @@ -883,6 +889,8 @@ TEST_F(ManagedPolicyTest, AmazonS3ReadOnlyAccess) act[s3GetObjectVersionAcl] = 1; act[s3GetObjectTorrent] = 1; act[s3GetObjectVersionTorrent] = 1; + act[s3GetObjectAttributes] = 1; + act[s3GetObjectVersionAttributes] = 1; act[s3GetAccelerateConfiguration] = 1; act[s3GetBucketAcl] = 1; act[s3GetBucketOwnershipControls] = 1; diff --git a/src/test/test_ipaddr.cc b/src/test/test_ipaddr.cc index 49038815318..21df1d4056b 100644 --- a/src/test/test_ipaddr.cc +++ b/src/test/test_ipaddr.cc @@ -995,3 +995,158 @@ TEST(pick_address, ipv4_ipv6_enabled2) ASSERT_EQ(-1, r); } } + +// Test for IPv4 address +TEST(is_addr_in_subnet, ipv4) +{ + std::string public_network = "10.1.1.0/24"; + entity_addr_t addr; + addr.parse("10.1.1.2", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + bool r = is_addr_in_subnet(cct.get(), public_network, addr); + ASSERT_EQ(true, r); +} + +// Test for IPv6 address +TEST(is_addr_in_subnet, ipv6) +{ + std::string public_network = "2001:db8::/64"; + entity_addr_t addr; + addr.parse("2001:db8::1", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv6", "true"); + cct->_conf.set_val("ms_bind_ipv4", "false"); + + bool r = is_addr_in_subnet(cct.get(), public_network, addr); + 
ASSERT_EQ(true, r); +} + +// Test for invalid address +TEST(is_addr_in_subnet, invalid_address) +{ + std::string public_network = "10.1.1.0/24"; + entity_addr_t addr; + addr.parse("192.168.1.1", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + bool r = is_addr_in_subnet(cct.get(), public_network, addr); + ASSERT_EQ(false, r); +} + +// Test for malformed address +TEST(is_addr_in_subnet, malformed_address) +{ + std::string public_network = "10.1.1.0/24"; + entity_addr_t addr; + addr.parse("invalid_address", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + // Test with a malformed address + bool r = is_addr_in_subnet(cct.get(), public_network, addr); + ASSERT_EQ(false, r); +} + +TEST(is_addr_in_subnet, boundary_ipv4) +{ + std::string public_network = "10.1.1.0/24"; + entity_addr_t addr_low; + addr_low.parse("10.1.1.0", nullptr); + entity_addr_t addr_high; + addr_high.parse("10.1.1.255", nullptr); + entity_addr_t addr_out; + addr_out.parse("10.1.2.0", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_low)); + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_high)); + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network, addr_out)); +} + +TEST(is_addr_in_subnet, boundary_ipv6) +{ + std::string public_network = "2001:db8::/64"; + entity_addr_t addr_low; + addr_low.parse("2001:db8::", nullptr); + entity_addr_t addr_high; + 
addr_high.parse("2001:db8:0:0:ffff:ffff:ffff:ffff", nullptr); + entity_addr_t addr_out; + addr_out.parse("2001:db9::", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv6", "true"); + cct->_conf.set_val("ms_bind_ipv4", "false"); + + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_low)); + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_high)); + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network, addr_out)); +} + +TEST(is_addr_in_subnet, overlapping_subnets) +{ + std::string public_network_1 = "10.1.1.0/24"; + std::string public_network_2 = "10.1.2.0/24"; + entity_addr_t addr; + addr.parse("10.1.1.5", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network_1, addr)); + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_2, addr)); +} + +TEST(is_addr_in_subnet, mismatched_family) +{ + std::string public_network_1 = "2001:db8::/64"; + entity_addr_t addr_1; + addr_1.parse("10.1.1.5", nullptr); + + std::string public_network_2 = "10.1.1.0/24"; + entity_addr_t addr_2; + addr_2.parse("2001:db8::1", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "true"); + + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_1, addr_1)); + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_2, addr_2)); +} + +TEST(is_addr_in_subnet, invalid_subnets) +{ + std::string public_network_1 = "10.1.1.0/33"; + std::string public_network_2 = "25.0.0.99/10"; + entity_addr_t addr; + addr.parse("10.1.1.2", 
nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_1, addr)); // Invalid prefix + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_2, addr)); // Invalid subnet string +} + diff --git a/src/tools/cephfs/top/cephfs-top b/src/tools/cephfs/top/cephfs-top index 9ecc47fc2d5..45900f9a025 100755 --- a/src/tools/cephfs/top/cephfs-top +++ b/src/tools/cephfs/top/cephfs-top @@ -148,7 +148,7 @@ def wrap(s, sl): """return a '+' suffixed wrapped string""" if len(s) < sl: return s - return f'{s[0:sl-1]}+' + return f'{s[0:sl - 1]}+' class FSTopBase(object): diff --git a/src/tools/rbd/Utils.cc b/src/tools/rbd/Utils.cc index 95c8725aa33..b20dca05bc6 100644 --- a/src/tools/rbd/Utils.cc +++ b/src/tools/rbd/Utils.cc @@ -337,11 +337,14 @@ int get_pool_image_snapshot_names(const po::variables_map &vm, SpecValidation spec_validation) { std::string pool_key = (mod == at::ARGUMENT_MODIFIER_DEST ? at::DEST_POOL_NAME : at::POOL_NAME); + std::string namespace_key = (mod == at::ARGUMENT_MODIFIER_DEST ? + at::DEST_NAMESPACE_NAME : at::NAMESPACE_NAME); std::string image_key = (mod == at::ARGUMENT_MODIFIER_DEST ? 
at::DEST_IMAGE_NAME : at::IMAGE_NAME); + return get_pool_generic_snapshot_names(vm, mod, spec_arg_index, pool_key, - pool_name, namespace_name, image_key, - "image", image_name, snap_name, + pool_name, namespace_key, namespace_name, + image_key, "image", image_name, snap_name, image_name_required, snapshot_presence, spec_validation); } @@ -351,6 +354,7 @@ int get_pool_generic_snapshot_names(const po::variables_map &vm, size_t *spec_arg_index, const std::string& pool_key, std::string *pool_name, + const std::string& namespace_key, std::string *namespace_name, const std::string& generic_key, const std::string& generic_key_desc, @@ -359,8 +363,6 @@ int get_pool_generic_snapshot_names(const po::variables_map &vm, bool generic_name_required, SnapshotPresence snapshot_presence, SpecValidation spec_validation) { - std::string namespace_key = (mod == at::ARGUMENT_MODIFIER_DEST ? - at::DEST_NAMESPACE_NAME : at::NAMESPACE_NAME); std::string snap_key = (mod == at::ARGUMENT_MODIFIER_DEST ? at::DEST_SNAPSHOT_NAME : at::SNAPSHOT_NAME); diff --git a/src/tools/rbd/Utils.h b/src/tools/rbd/Utils.h index 5076fd7fe9c..6aa0f2fdbdf 100644 --- a/src/tools/rbd/Utils.h +++ b/src/tools/rbd/Utils.h @@ -163,10 +163,11 @@ int get_pool_generic_snapshot_names( const boost::program_options::variables_map &vm, argument_types::ArgumentModifier mod, size_t *spec_arg_index, const std::string& pool_key, std::string *pool_name, - std::string *namespace_name, const std::string& generic_key, - const std::string& generic_key_desc, std::string *generic_name, - std::string *snap_name, bool generic_name_required, - SnapshotPresence snapshot_presence, SpecValidation spec_validation); + const std::string& namespace_key, std::string *namespace_name, + const std::string& generic_key, const std::string& generic_key_desc, + std::string *generic_name, std::string *snap_name, + bool generic_name_required, SnapshotPresence snapshot_presence, + SpecValidation spec_validation); int get_pool_image_id(const 
boost::program_options::variables_map &vm, size_t *spec_arg_index, diff --git a/src/tools/rbd/action/Group.cc b/src/tools/rbd/action/Group.cc index d97e120d438..100bdc19496 100644 --- a/src/tools/rbd/action/Group.cc +++ b/src/tools/rbd/action/Group.cc @@ -28,6 +28,9 @@ static const std::string DEST_GROUP_NAME("dest-group"); static const std::string GROUP_POOL_NAME("group-" + at::POOL_NAME); static const std::string IMAGE_POOL_NAME("image-" + at::POOL_NAME); +static const std::string GROUP_NAMESPACE_NAME("group-" + at::NAMESPACE_NAME); +static const std::string IMAGE_NAMESPACE_NAME("image-" + at::NAMESPACE_NAME); + void add_group_option(po::options_description *opt, at::ArgumentModifier modifier) { std::string name = GROUP_NAME; @@ -107,8 +110,8 @@ int execute_create(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, - utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -187,8 +190,8 @@ int execute_remove(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, - utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -221,8 +224,8 @@ int execute_rename(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, - 
utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -233,9 +236,9 @@ int execute_rename(const po::variables_map &vm, r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, at::DEST_POOL_NAME, - &dest_pool_name, &dest_namespace_name, DEST_GROUP_NAME, "group", - &dest_group_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, - utils::SPEC_VALIDATION_FULL); + &dest_pool_name, at::DEST_NAMESPACE_NAME, &dest_namespace_name, + DEST_GROUP_NAME, "group", &dest_group_name, nullptr, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -283,8 +286,8 @@ int execute_info(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, - utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -335,8 +338,9 @@ int execute_add(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, GROUP_POOL_NAME, - &group_pool_name, &group_namespace_name, GROUP_NAME, "group", &group_name, - nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + &group_pool_name, GROUP_NAMESPACE_NAME, &group_namespace_name, + GROUP_NAME, "group", &group_name, nullptr, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -347,9 +351,9 @@ int execute_add(const po::variables_map &vm, r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, IMAGE_POOL_NAME, - 
&image_pool_name, &image_namespace_name, at::IMAGE_NAME, "image", - &image_name, nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, - utils::SPEC_VALIDATION_FULL); + &image_pool_name, IMAGE_NAMESPACE_NAME, &image_namespace_name, + at::IMAGE_NAME, "image", &image_name, nullptr, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -393,8 +397,9 @@ int execute_remove_image(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, GROUP_POOL_NAME, - &group_pool_name, &group_namespace_name, GROUP_NAME, "group", &group_name, - nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + &group_pool_name, GROUP_NAMESPACE_NAME, &group_namespace_name, + GROUP_NAME, "group", &group_name, nullptr, true, + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -410,9 +415,9 @@ int execute_remove_image(const po::variables_map &vm, r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, IMAGE_POOL_NAME, - &image_pool_name, &image_namespace_name, at::IMAGE_NAME, "image", - &image_name, nullptr, image_id.empty(), utils::SNAPSHOT_PRESENCE_NONE, - utils::SPEC_VALIDATION_FULL); + &image_pool_name, IMAGE_NAMESPACE_NAME, &image_namespace_name, + at::IMAGE_NAME, "image", &image_name, nullptr, image_id.empty(), + utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -464,8 +469,8 @@ int execute_list_images(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, - utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + nullptr, true, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -563,8 +568,9 @@ 
int execute_group_snap_create(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true, - utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -604,8 +610,9 @@ int execute_group_snap_remove(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true, - utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -640,8 +647,9 @@ int execute_group_snap_rename(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, &source_snap_name, true, - utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + &source_snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -696,8 +704,8 @@ int execute_group_snap_list(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, nullptr, true, - utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + nullptr, true, 
utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -764,8 +772,9 @@ int execute_group_snap_info(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, &group_snap_name, true, - utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + &group_snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -872,8 +881,9 @@ int execute_group_snap_rollback(const po::variables_map &vm, int r = utils::get_pool_generic_snapshot_names( vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, at::POOL_NAME, &pool_name, - &namespace_name, GROUP_NAME, "group", &group_name, &snap_name, true, - utils::SNAPSHOT_PRESENCE_REQUIRED, utils::SPEC_VALIDATION_FULL); + at::NAMESPACE_NAME, &namespace_name, GROUP_NAME, "group", &group_name, + &snap_name, true, utils::SNAPSHOT_PRESENCE_REQUIRED, + utils::SPEC_VALIDATION_FULL); if (r < 0) { return r; } @@ -954,9 +964,6 @@ void get_add_arguments(po::options_description *positional, add_prefixed_pool_option(options, "image"); add_prefixed_namespace_option(options, "image"); at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE); - - at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE, - " unless overridden"); } void get_remove_image_arguments(po::options_description *positional, @@ -979,8 +986,6 @@ void get_remove_image_arguments(po::options_description *positional, add_prefixed_namespace_option(options, "image"); at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE); - at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE, - " unless overridden"); at::add_image_id_option(options); } diff --git a/src/tools/rbd/action/MirrorPool.cc b/src/tools/rbd/action/MirrorPool.cc index 58e2d4dc329..6a546c3f73a 100644 --- 
a/src/tools/rbd/action/MirrorPool.cc +++ b/src/tools/rbd/action/MirrorPool.cc @@ -355,6 +355,10 @@ protected: virtual ~ImageRequestBase() { } + virtual bool open_read_only() const { + return false; + } + virtual bool skip_get_info() const { return false; } @@ -429,8 +433,13 @@ private: librbd::RBD rbd; auto aio_completion = utils::create_aio_completion< ImageRequestBase, &ImageRequestBase::handle_open_image>(this); - rbd.aio_open(m_io_ctx, m_image, m_image_name.c_str(), nullptr, - aio_completion); + if (open_read_only()) { + rbd.aio_open_read_only(m_io_ctx, m_image, m_image_name.c_str(), nullptr, + aio_completion); + } else { + rbd.aio_open(m_io_ctx, m_image, m_image_name.c_str(), nullptr, + aio_completion); + } } void handle_open_image(int r) { @@ -604,6 +613,10 @@ public: } protected: + bool open_read_only() const override { + return true; + } + bool skip_get_info() const override { return true; } diff --git a/src/vstart.sh b/src/vstart.sh index 41a8310891b..a992f33c856 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -159,6 +159,7 @@ smallmds=0 short=0 crimson=0 ec=0 +cephexporter=0 cephadm=0 parallel=true restart=1 @@ -233,6 +234,7 @@ options: -G disable Kerberos/GSSApi authentication --hitset <pool> <hit_set_type>: enable hitset tracking -e : create an erasure pool + --cephexporter: start the ceph-exporter daemon -o config add extra config parameters to all sections --rgw_port specify ceph rgw http listen port --rgw_frontend specify the rgw frontend configuration @@ -372,6 +374,9 @@ case $1 in -e) ec=1 ;; + --cephexporter) + cephexporter=1 + ;; --new | -n) new=1 ;; @@ -963,7 +968,17 @@ $BLUESTORE_OPTS ; kstore kstore fsck on mount = true +EOF + if [ "$crimson" -eq 1 ]; then + wconf <<EOF + crimson osd objectstore = $objectstore +EOF + else + wconf <<EOF osd objectstore = $objectstore +EOF + fi + wconf <<EOF $SEASTORE_OPTS $COSDSHORT $(format_conf "${extra_conf}") @@ -1130,6 +1145,17 @@ EOF fi } +start_cephexporter() { + debug echo "Starting Ceph exporter 
daemon..." + + # Define socket directory for the exporter + # Start the exporter daemon + prunb ceph-exporter \ + -c "$conf_fn" \ + --sock-dir "$CEPH_ASOK_DIR" \ + --addrs "$IP" +} + start_osd() { if [ $inc_osd_num -gt 0 ]; then old_maxosd=$($CEPH_BIN/ceph osd getmaxosd | sed -e 's/max_osd = //' -e 's/ in epoch.*//') @@ -1676,28 +1702,30 @@ if [ "$ceph_osd" == "crimson-osd" ]; then if [ "$trace" -ne 0 ]; then extra_seastar_args=" --trace" fi - if [ "$(expr $(nproc) - 1)" -gt "$(($CEPH_NUM_OSD * crimson_smp))" ]; then - if [ $crimson_alien_num_cores -gt 0 ]; then - alien_bottom_cpu=$(($CEPH_NUM_OSD * crimson_smp)) - alien_top_cpu=$(( alien_bottom_cpu + crimson_alien_num_cores - 1 )) - # Ensure top value within range: - if [ "$(($alien_top_cpu))" -gt "$(expr $(nproc) - 1)" ]; then - alien_top_cpu=$(expr $(nproc) - 1) + if [ "$objectstore" == "bluestore" ]; then + if [ "$(expr $(nproc) - 1)" -gt "$(($CEPH_NUM_OSD * crimson_smp))" ]; then + if [ $crimson_alien_num_cores -gt 0 ]; then + alien_bottom_cpu=$(($CEPH_NUM_OSD * crimson_smp)) + alien_top_cpu=$(( alien_bottom_cpu + crimson_alien_num_cores - 1 )) + # Ensure top value within range: + if [ "$(($alien_top_cpu))" -gt "$(expr $(nproc) - 1)" ]; then + alien_top_cpu=$(expr $(nproc) - 1) + fi + echo "crimson_alien_thread_cpu_cores: $alien_bottom_cpu-$alien_top_cpu" + # This is a (logical) processor id range, it could be refined to encompass only physical processor ids + # (equivalently, ignore hyperthreading sibling processor ids) + $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores "$alien_bottom_cpu-$alien_top_cpu" + else + echo "crimson_alien_thread_cpu_cores:" $(($CEPH_NUM_OSD * crimson_smp))-"$(expr $(nproc) - 1)" + $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores $(($CEPH_NUM_OSD * crimson_smp))-"$(expr $(nproc) - 1)" + fi + if [ $crimson_alien_num_threads -gt 0 ]; then + echo "$CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_op_num_threads 
$crimson_alien_num_threads" + $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_op_num_threads "$crimson_alien_num_threads" fi - echo "crimson_alien_thread_cpu_cores: $alien_bottom_cpu-$alien_top_cpu" - # This is a (logical) processor id range, it could be refined to encompass only physical processor ids - # (equivalently, ignore hyperthreading sibling processor ids) - $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores "$alien_bottom_cpu-$alien_top_cpu" else - echo "crimson_alien_thread_cpu_cores:" $(($CEPH_NUM_OSD * crimson_smp))-"$(expr $(nproc) - 1)" - $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores $(($CEPH_NUM_OSD * crimson_smp))-"$(expr $(nproc) - 1)" - fi - if [ $crimson_alien_num_threads -gt 0 ]; then - echo "$CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_op_num_threads $crimson_alien_num_threads" - $CEPH_BIN/ceph -c $conf_fn config set osd crimson_alien_op_num_threads "$crimson_alien_num_threads" + echo "No alien thread cpu core isolation" fi - else - echo "No alien thread cpu core isolation" fi fi @@ -1726,6 +1754,10 @@ if [ $CEPH_NUM_MDS -gt 0 ]; then ceph_adm fs authorize \* "client.fs" / rwp >> "$keyring_fn" fi +if [ "$cephexporter" -eq 1 ]; then + start_cephexporter +fi + # Don't set max_mds until all the daemons are started, otherwise # the intended standbys might end up in active roles. if [ "$CEPH_MAX_MDS" -gt 1 ]; then |