diff options
Diffstat (limited to 'src')
180 files changed, 7329 insertions, 3036 deletions
diff --git a/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/src/ceph-volume/ceph_volume/devices/lvm/zap.py index c278de43eb0..a6d82c7f0fa 100644 --- a/src/ceph-volume/ceph_volume/devices/lvm/zap.py +++ b/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -119,13 +119,12 @@ class Zap: osd_uuid = details.get('osd_uuid') break - for osd_uuid, details in raw_report.items(): + for _, details in raw_report.items(): device: str = details.get('device') if details.get('osd_uuid') == osd_uuid: raw_devices.add(device) return list(raw_devices) - def find_associated_devices(self) -> List[api.Volume]: """From an ``osd_id`` and/or an ``osd_fsid``, filter out all the Logical Volumes (LVs) in the diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py index cca64e83ab0..c971b7776ef 100644 --- a/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py +++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/data_zap.py @@ -22,7 +22,7 @@ ceph_bluestore_tool_output = ''' "whoami": "0" }, "/dev/vdx": { - "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b6", + "osd_uuid": "d5a496bc-dcb9-4ad0-a12c-393d3200d2b7", "size": 214748364800, "btime": "2024-10-16T10:51:05.955279+0000", "description": "main", diff --git a/src/ceph-volume/ceph_volume/util/disk.py b/src/ceph-volume/ceph_volume/util/disk.py index 77b55314f66..921e61a4534 100644 --- a/src/ceph-volume/ceph_volume/util/disk.py +++ b/src/ceph-volume/ceph_volume/util/disk.py @@ -347,12 +347,21 @@ def lsblk_all(device: str = '', return result -def is_device(dev): +def is_device(dev: str) -> bool: """ - Boolean to determine if a given device is a block device (**not** - a partition!) + Determines whether the given path corresponds to a block device (not a partition). - For example: /dev/sda would return True, but not /dev/sdc1 + This function checks whether the provided device path represents a valid block device, + such as a physical disk (/dev/sda) or an allowed loop device, but excludes partitions + (/dev/sdc1). It performs several validation steps, including file existence, path format, + device type, and additional checks for loop devices if allowed. + + Args: + dev (str): The path to the device (e.g., "/dev/sda"). + + Returns: + bool: True if the path corresponds to a valid block device (not a partition), + otherwise False. """ if not os.path.exists(dev): return False @@ -364,7 +373,7 @@ def is_device(dev): TYPE = lsblk(dev).get('TYPE') if TYPE: - return TYPE in ['disk', 'mpath'] + return TYPE in ['disk', 'mpath', 'loop'] # fallback to stat return _stat_is_device(os.lstat(dev).st_mode) and not is_partition(dev) diff --git a/src/ceph-volume/ceph_volume/util/prepare.py b/src/ceph-volume/ceph_volume/util/prepare.py index 9c863b83d93..ff7fc023fc4 100644 --- a/src/ceph-volume/ceph_volume/util/prepare.py +++ b/src/ceph-volume/ceph_volume/util/prepare.py @@ -9,6 +9,7 @@ import logging import json from ceph_volume import process, conf, terminal from ceph_volume.util import system, constants, str_to_int, disk +from typing import Optional logger = logging.getLogger(__name__) mlogger = terminal.MultiLogger(__name__) @@ -121,7 +122,7 @@ def get_block_wal_size(lv_format=True): return wal_size -def create_id(fsid, json_secrets, osd_id=None): +def create_id(fsid: str, json_secrets: str, osd_id: Optional[str]=None) -> str: """ :param fsid: The osd fsid to create, always required :param json_secrets: a json-ready object with whatever secrets are wanted diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index d2ddf564116..a8616980e4d 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -111,6 +111,7 @@ from cephadmlib.file_utils import ( unlink_file, write_new, write_tmp, + update_meta_file, ) from cephadmlib.net_utils import ( build_addrv_params, @@ -3453,6 +3454,7 @@ def list_daemons( detail: bool = True, legacy_dir: Optional[str] = None, daemon_name: Optional[str] = None, + type_of_daemon: Optional[str] = None, ) -> List[Dict[str, str]]: host_version: Optional[str] = None ls = [] @@ -3489,6 +3491,8 @@ def list_daemons( if os.path.exists(data_dir): for i in os.listdir(data_dir): if i in ['mon', 'osd', 'mds', 'mgr', 'rgw']: + if type_of_daemon and type_of_daemon != i: + continue daemon_type = i for j in os.listdir(os.path.join(data_dir, i)): if '-' not in j: @@ -3525,6 +3529,8 @@ def list_daemons( if daemon_name and name != daemon_name: continue (daemon_type, daemon_id) = j.split('.', 1) + if type_of_daemon and type_of_daemon != daemon_type: + continue unit_name = get_unit_name(fsid, daemon_type, daemon_id) @@ -4705,6 +4711,34 @@ def command_list_images(ctx: CephadmContext) -> None: # print default images cp_obj.write(sys.stdout) + +def update_service_for_daemon(ctx: CephadmContext, + available_daemons: list, + update_daemons: list) -> None: + """ Update the unit.meta file of daemon with required service name for valid daemons""" + + data = {'service_name': ctx.service_name} + # check if all the daemon names are valid + if not set(update_daemons).issubset(set(available_daemons)): + raise Error(f'Error EINVAL: one or more daemons of {update_daemons} does not exist on this host') + for name in update_daemons: + path = os.path.join(ctx.data_dir, ctx.fsid, name, 'unit.meta') + update_meta_file(path, data) + print(f'Successfully updated daemon {name} with service {ctx.service_name}') + + +@infer_fsid +def command_update_osd_service(ctx: CephadmContext) -> int: + """update service for provided daemon""" + update_daemons = [f'osd.{osd_id}' for osd_id in ctx.osd_ids.split(',')] + daemons = list_daemons(ctx, detail=False, type_of_daemon='osd') + if not daemons: + raise Error(f'Daemon {ctx.osd_ids} does not exists on this host') + available_daemons = [d['name'] for d in daemons] + update_service_for_daemon(ctx, available_daemons, update_daemons) + return 0 + + ################################## @@ -5571,6 +5605,14 @@ def _get_parser(): parser_list_images = subparsers.add_parser( 'list-images', help='list all the default images') parser_list_images.set_defaults(func=command_list_images) + + parser_update_service = subparsers.add_parser( + 'update-osd-service', help='update service for provided daemon') + parser_update_service.set_defaults(func=command_update_osd_service) + parser_update_service.add_argument('--fsid', help='cluster FSID') + parser_update_service.add_argument('--osd-ids', required=True, help='Comma-separated OSD IDs') + parser_update_service.add_argument('--service-name', required=True, help='OSD service name') + return parser diff --git a/src/cephadm/cephadmlib/daemons/monitoring.py b/src/cephadm/cephadmlib/daemons/monitoring.py index 9a9402632b0..4ba00daaefb 100644 --- a/src/cephadm/cephadmlib/daemons/monitoring.py +++ b/src/cephadm/cephadmlib/daemons/monitoring.py @@ -16,7 +16,13 @@ from ..daemon_form import register as register_daemon_form from ..daemon_identity import DaemonIdentity from ..deployment_utils import to_deployment_container from ..exceptions import Error -from ..net_utils import get_fqdn, get_hostname, get_ip_addresses, wrap_ipv6 +from ..net_utils import ( + get_fqdn, + get_hostname, + get_ip_addresses, + wrap_ipv6, + EndPoint, +) @register_daemon_form @@ -89,11 +95,6 @@ class Monitoring(ContainerDaemonForm): 'image': DefaultImages.ALERTMANAGER.image_ref, 'cpus': '2', 'memory': '2GB', - 'args': [ - '--cluster.listen-address=:{}'.format( - port_map['alertmanager'][1] - ), - ], 'config-json-files': [ 'alertmanager.yml', ], @@ -248,11 +249,14 @@ class Monitoring(ContainerDaemonForm): ip = meta['ip'] if 'ports' in meta and meta['ports']: port = meta['ports'][0] - if daemon_type == 'prometheus': - config = fetch_configs(ctx) + config = fetch_configs(ctx) + if daemon_type in ['prometheus', 'alertmanager']: ip_to_bind_to = config.get('ip_to_bind_to', '') if ip_to_bind_to: ip = ip_to_bind_to + web_listen_addr = str(EndPoint(ip, port)) + r += [f'--web.listen-address={web_listen_addr}'] + if daemon_type == 'prometheus': retention_time = config.get('retention_time', '15d') retention_size = config.get( 'retention_size', '0' @@ -276,9 +280,11 @@ class Monitoring(ContainerDaemonForm): r += ['--web.route-prefix=/prometheus/'] else: r += [f'--web.external-url={scheme}://{host}:{port}'] - r += [f'--web.listen-address={ip}:{port}'] if daemon_type == 'alertmanager': - config = fetch_configs(ctx) + clus_listen_addr = str( + EndPoint(ip, self.port_map[daemon_type][1]) + ) + r += [f'--cluster.listen-address={clus_listen_addr}'] use_url_prefix = config.get('use_url_prefix', False) peers = config.get('peers', list()) # type: ignore for peer in peers: @@ -294,13 +300,11 @@ class Monitoring(ContainerDaemonForm): if daemon_type == 'promtail': r += ['--config.expand-env'] if daemon_type == 'prometheus': - config = fetch_configs(ctx) try: r += [f'--web.config.file={config["web_config"]}'] except KeyError: pass if daemon_type == 'node-exporter': - config = fetch_configs(ctx) try: r += [f'--web.config.file={config["web_config"]}'] except KeyError: diff --git a/src/cephadm/cephadmlib/file_utils.py b/src/cephadm/cephadmlib/file_utils.py index 27e70e31756..4dd88cc3671 100644 --- a/src/cephadm/cephadmlib/file_utils.py +++ b/src/cephadm/cephadmlib/file_utils.py @@ -5,6 +5,7 @@ import datetime import logging import os import tempfile +import json from contextlib import contextmanager from pathlib import Path @@ -157,3 +158,26 @@ def unlink_file( except Exception: if not ignore_errors: raise + + +def update_meta_file(file_path: str, update_key_val: dict) -> None: + """Update key in the file with provided value""" + try: + with open(file_path, 'r') as fh: + data = json.load(fh) + file_stat = os.stat(file_path) + except FileNotFoundError: + raise + except Exception: + logger.exception(f'Failed to update {file_path}') + raise + data.update( + {key: value for key, value in update_key_val.items() if key in data} + ) + + with write_new( + file_path, + owner=(file_stat.st_uid, file_stat.st_gid), + perms=(file_stat.st_mode & 0o777), + ) as fh: + fh.write(json.dumps(data, indent=4) + '\n') diff --git a/src/cephadm/cephadmlib/net_utils.py b/src/cephadm/cephadmlib/net_utils.py index 9a7f138b1c6..bfa61d933ef 100644 --- a/src/cephadm/cephadmlib/net_utils.py +++ b/src/cephadm/cephadmlib/net_utils.py @@ -24,12 +24,22 @@ class EndPoint: def __init__(self, ip: str, port: int) -> None: self.ip = ip self.port = port + self.is_ipv4 = True + try: + if ip and ipaddress.ip_network(ip).version == 6: + self.is_ipv4 = False + except Exception: + logger.exception('Failed to check ip address version') def __str__(self) -> str: - return f'{self.ip}:{self.port}' + if self.is_ipv4: + return f'{self.ip}:{self.port}' + return f'[{self.ip}]:{self.port}' def __repr__(self) -> str: - return f'{self.ip}:{self.port}' + if self.is_ipv4: + return f'{self.ip}:{self.port}' + return f'[{self.ip}]:{self.port}' def attempt_bind(ctx, s, address, port): diff --git a/src/cephadm/tests/test_deploy.py b/src/cephadm/tests/test_deploy.py index c5094db335f..1736639ed55 100644 --- a/src/cephadm/tests/test_deploy.py +++ b/src/cephadm/tests/test_deploy.py @@ -316,7 +316,7 @@ def test_deploy_a_monitoring_container(cephadm_fs, funkypatch): runfile_lines = f.read().splitlines() assert 'podman' in runfile_lines[-1] assert runfile_lines[-1].endswith( - 'quay.io/titans/prometheus:latest --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus --storage.tsdb.retention.time=15d --storage.tsdb.retention.size=0 --web.external-url=http://10.10.10.10:9095 --web.listen-address=1.2.3.4:9095' + 'quay.io/titans/prometheus:latest --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/prometheus --web.listen-address=1.2.3.4:9095 --storage.tsdb.retention.time=15d --storage.tsdb.retention.size=0 --web.external-url=http://10.10.10.10:9095' ) assert '--user 8765' in runfile_lines[-1] assert f'-v /var/lib/ceph/{fsid}/prometheus.fire/etc/prometheus:/etc/prometheus:Z' in runfile_lines[-1] diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index ea3cce16609..c607839a8d2 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -13,6 +13,7 @@ if(WIN32) endif() add_subdirectory(io_exerciser) +add_subdirectory(json) add_subdirectory(options) set(common_srcs diff --git a/src/common/bit_vector.hpp b/src/common/bit_vector.hpp index 961d9a0192e..c5fd491ed29 100644 --- a/src/common/bit_vector.hpp +++ b/src/common/bit_vector.hpp @@ -29,8 +29,8 @@ private: static const uint8_t MASK = static_cast<uint8_t>((1 << _bit_count) - 1); // must be power of 2 - BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1))); - BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE); + static_assert((_bit_count != 0) && !(_bit_count & (_bit_count - 1))); + static_assert(_bit_count <= BITS_PER_BYTE); template <typename DataIterator> class ReferenceImpl { diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h index 01feff4c063..0b05be5372e 100644 --- a/src/common/ceph_time.h +++ b/src/common/ceph_time.h @@ -342,6 +342,23 @@ public: } }; +// Please note time_guard is not thread safety -- multiple threads +// updating same diff_accumulator can corrupt it. +template <class ClockT = mono_clock> +class time_guard { + const typename ClockT::time_point start; + timespan& diff_accumulator; + +public: + time_guard(timespan& diff_accumulator) + : start(ClockT::now()), + diff_accumulator(diff_accumulator) { + } + ~time_guard() { + diff_accumulator += ClockT::now() - start; + } +}; + namespace time_detail { // So that our subtractions produce negative spans rather than // arithmetic underflow. diff --git a/src/common/io_exerciser/CMakeLists.txt b/src/common/io_exerciser/CMakeLists.txt index 07091df86e1..ab2e64fc222 100644 --- a/src/common/io_exerciser/CMakeLists.txt +++ b/src/common/io_exerciser/CMakeLists.txt @@ -5,9 +5,11 @@ add_library(object_io_exerciser STATIC Model.cc ObjectModel.cc RadosIo.cc + EcIoSequence.cc ) target_link_libraries(object_io_exerciser - librados + librados global + json_structures )
\ No newline at end of file diff --git a/src/common/io_exerciser/DataGenerator.cc b/src/common/io_exerciser/DataGenerator.cc index 9aa77eeb6e9..701c32fa9ec 100644 --- a/src/common/io_exerciser/DataGenerator.cc +++ b/src/common/io_exerciser/DataGenerator.cc @@ -2,32 +2,28 @@ // vim: ts=8 sw=2 smarttab #include "DataGenerator.h" -#include "ObjectModel.h" +#include <chrono> +#include <iostream> +#include <stdexcept> +#include "ObjectModel.h" #include "common/debug.h" #include "common/dout.h" - #include "fmt/format.h" #include "fmt/ranges.h" -#include <chrono> -#include <iostream> -#include <stdexcept> - #define dout_subsys ceph_subsys_rados #define dout_context g_ceph_context using DataGenerator = ceph::io_exerciser::data_generation::DataGenerator; -using SeededRandomGenerator = ceph::io_exerciser::data_generation - ::SeededRandomGenerator; -using HeaderedSeededRandomGenerator = ceph::io_exerciser::data_generation - ::HeaderedSeededRandomGenerator; +using SeededRandomGenerator = + ceph::io_exerciser::data_generation ::SeededRandomGenerator; +using HeaderedSeededRandomGenerator = + ceph::io_exerciser::data_generation ::HeaderedSeededRandomGenerator; std::unique_ptr<DataGenerator> DataGenerator::create_generator( - GenerationType generationType, const ObjectModel& model) -{ - switch(generationType) - { + GenerationType generationType, const ObjectModel& model) { + switch (generationType) { case GenerationType::SeededRandom: return std::make_unique<SeededRandomGenerator>(model); case GenerationType::HeaderedSeededRandom: @@ -39,28 +35,25 @@ std::unique_ptr<DataGenerator> DataGenerator::create_generator( return nullptr; } -bufferlist DataGenerator::generate_wrong_data(uint64_t offset, uint64_t length) -{ +bufferlist DataGenerator::generate_wrong_data(uint64_t offset, + uint64_t length) { bufferlist retlist; uint64_t block_size = m_model.get_block_size(); char buffer[block_size]; - for (uint64_t block_offset = offset; - block_offset < offset + length; - block_offset++) - { + for (uint64_t block_offset = offset; block_offset < offset + length; + block_offset++) { std::memset(buffer, 0, block_size); retlist.append(ceph::bufferptr(buffer, block_size)); } return retlist; } -bool DataGenerator::validate(bufferlist& bufferlist, uint64_t offset, uint64_t length) -{ +bool DataGenerator::validate(bufferlist& bufferlist, uint64_t offset, + uint64_t length) { return bufferlist.contents_equal(generate_data(offset, length)); } -ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset) -{ +ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset) { uint64_t block_size = m_model.get_block_size(); char buffer[block_size]; @@ -70,29 +63,26 @@ ceph::bufferptr SeededRandomGenerator::generate_block(uint64_t block_offset) constexpr size_t generation_length = sizeof(uint64_t); - for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--) - { + for (uint64_t i = 0; i < block_size; + i += (2 * generation_length), rand1++, rand2--) { std::memcpy(buffer + i, &rand1, generation_length); std::memcpy(buffer + i + generation_length, &rand2, generation_length); } size_t remainingBytes = block_size % (generation_length * 2); - if (remainingBytes > generation_length) - { + if (remainingBytes > generation_length) { size_t remainingBytes2 = remainingBytes - generation_length; std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2); - } - else if (remainingBytes > 0) - { + } else if (remainingBytes > 0) { std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); } return ceph::bufferptr(buffer, block_size); } -ceph::bufferptr SeededRandomGenerator::generate_wrong_block(uint64_t block_offset) -{ +ceph::bufferptr SeededRandomGenerator::generate_wrong_block( + uint64_t block_offset) { uint64_t block_size = m_model.get_block_size(); char buffer[block_size]; @@ -102,141 +92,134 @@ ceph::bufferptr SeededRandomGenerator::generate_wrong_block(uint64_t block_offse constexpr size_t generation_length = sizeof(uint64_t); - for (uint64_t i = 0; i < block_size; i+=(2*generation_length), rand1++, rand2--) - { + for (uint64_t i = 0; i < block_size; + i += (2 * generation_length), rand1++, rand2--) { std::memcpy(buffer + i, &rand1, generation_length); std::memcpy(buffer + i + generation_length, &rand2, generation_length); } size_t remainingBytes = block_size % (generation_length * 2); - if (remainingBytes > generation_length) - { + if (remainingBytes > generation_length) { size_t remainingBytes2 = remainingBytes - generation_length; std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); std::memcpy(buffer + block_size - remainingBytes2, &rand2, remainingBytes2); - } - else if (remainingBytes > 0) - { + } else if (remainingBytes > 0) { std::memcpy(buffer + block_size - remainingBytes, &rand1, remainingBytes); } return ceph::bufferptr(buffer, block_size); } -bufferlist SeededRandomGenerator::generate_data(uint64_t offset, uint64_t length) -{ +bufferlist SeededRandomGenerator::generate_data(uint64_t offset, + uint64_t length) { bufferlist retlist; - for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++) - { + for (uint64_t block_offset = offset; block_offset < offset + length; + block_offset++) { retlist.append(generate_block(block_offset)); } return retlist; } -bufferlist SeededRandomGenerator::generate_wrong_data(uint64_t offset, uint64_t length) -{ +bufferlist SeededRandomGenerator::generate_wrong_data(uint64_t offset, + uint64_t length) { bufferlist retlist; - for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++) - { + for (uint64_t block_offset = offset; block_offset < offset + length; + block_offset++) { retlist.append(generate_wrong_block(block_offset)); } return retlist; } -HeaderedSeededRandomGenerator - ::HeaderedSeededRandomGenerator(const ObjectModel& model, - std::optional<uint64_t> unique_run_id) : - SeededRandomGenerator(model), - unique_run_id(unique_run_id.value_or(generate_unique_run_id())) -{ - -} +HeaderedSeededRandomGenerator ::HeaderedSeededRandomGenerator( + const ObjectModel& model, std::optional<uint64_t> unique_run_id) + : SeededRandomGenerator(model), + unique_run_id(unique_run_id.value_or(generate_unique_run_id())) {} -uint64_t HeaderedSeededRandomGenerator::generate_unique_run_id() -{ +uint64_t HeaderedSeededRandomGenerator::generate_unique_run_id() { std::mt19937_64 random_generator = - std::mt19937_64(duration_cast<std::chrono::milliseconds>( - std::chrono::system_clock::now().time_since_epoch()).count()); + std::mt19937_64(duration_cast<std::chrono::milliseconds>( + std::chrono::system_clock::now().time_since_epoch()) + .count()); - return random_generator(); + return random_generator(); } -ceph::bufferptr HeaderedSeededRandomGenerator::generate_block(uint64_t block_offset) -{ +ceph::bufferptr HeaderedSeededRandomGenerator::generate_block( + uint64_t block_offset) { SeedBytes seed = m_model.get_seed(block_offset); - TimeBytes current_time = duration_cast<std::chrono::milliseconds>( - std::chrono::system_clock::now().time_since_epoch()).count(); + TimeBytes current_time = + duration_cast<std::chrono::milliseconds>( + std::chrono::system_clock::now().time_since_epoch()) + .count(); - ceph::bufferptr bufferptr = SeededRandomGenerator::generate_block(block_offset); + ceph::bufferptr bufferptr = + SeededRandomGenerator::generate_block(block_offset); - std::memcpy(bufferptr.c_str() + uniqueIdStart(), &unique_run_id, uniqueIdLength()); + std::memcpy(bufferptr.c_str() + uniqueIdStart(), &unique_run_id, + uniqueIdLength()); std::memcpy(bufferptr.c_str() + seedStart(), &seed, seedLength()); std::memcpy(bufferptr.c_str() + timeStart(), ¤t_time, timeLength()); return bufferptr; } -ceph::bufferptr HeaderedSeededRandomGenerator::generate_wrong_block(uint64_t block_offset) -{ +ceph::bufferptr HeaderedSeededRandomGenerator::generate_wrong_block( + uint64_t block_offset) { return HeaderedSeededRandomGenerator::generate_block(block_offset % 8); } const HeaderedSeededRandomGenerator::UniqueIdBytes - HeaderedSeededRandomGenerator::readUniqueRunId(uint64_t block_offset, - const bufferlist& bufferlist) -{ +HeaderedSeededRandomGenerator::readUniqueRunId(uint64_t block_offset, + const bufferlist& bufferlist) { UniqueIdBytes read_unique_run_id = 0; - std::memcpy(&read_unique_run_id, - &bufferlist[(block_offset * m_model.get_block_size()) + uniqueIdStart()], - uniqueIdLength()); + std::memcpy( + &read_unique_run_id, + &bufferlist[(block_offset * m_model.get_block_size()) + uniqueIdStart()], + uniqueIdLength()); return read_unique_run_id; } const HeaderedSeededRandomGenerator::SeedBytes - HeaderedSeededRandomGenerator::readSeed(uint64_t block_offset, - const bufferlist& bufferlist) -{ +HeaderedSeededRandomGenerator::readSeed(uint64_t block_offset, + const bufferlist& bufferlist) { SeedBytes read_seed = 0; - std::memcpy(&read_seed, - &bufferlist[(block_offset * m_model.get_block_size()) + seedStart()], - seedLength()); + std::memcpy( + &read_seed, + &bufferlist[(block_offset * m_model.get_block_size()) + seedStart()], + seedLength()); return read_seed; } const HeaderedSeededRandomGenerator::TimeBytes - HeaderedSeededRandomGenerator::readDateTime(uint64_t block_offset, - const bufferlist& bufferlist) -{ +HeaderedSeededRandomGenerator::readDateTime(uint64_t block_offset, + const bufferlist& bufferlist) { TimeBytes read_time = 0; - std::memcpy(&read_time, - &bufferlist[(block_offset * m_model.get_block_size()) + timeStart()], - timeLength()); + std::memcpy( + &read_time, + &bufferlist[(block_offset * m_model.get_block_size()) + timeStart()], + timeLength()); return read_time; } bool HeaderedSeededRandomGenerator::validate(bufferlist& bufferlist, - uint64_t offset, uint64_t length) -{ + uint64_t offset, uint64_t length) { std::vector<uint64_t> invalid_block_offsets; - for (uint64_t block_offset = offset; block_offset < offset + length; block_offset++) - { - bool valid_block - = validate_block(block_offset, - (bufferlist.c_str() + ((block_offset - offset) * - m_model.get_block_size()))); - if (!valid_block) - { + for (uint64_t block_offset = offset; block_offset < offset + length; + block_offset++) { + bool valid_block = validate_block( + block_offset, (bufferlist.c_str() + + ((block_offset - offset) * m_model.get_block_size()))); + if (!valid_block) { invalid_block_offsets.push_back(block_offset); } } - if (!invalid_block_offsets.empty()) - { + if (!invalid_block_offsets.empty()) { printDebugInformationForOffsets(offset, invalid_block_offsets, bufferlist); } @@ -244,59 +227,51 @@ bool HeaderedSeededRandomGenerator::validate(bufferlist& bufferlist, } bool HeaderedSeededRandomGenerator::validate_block(uint64_t block_offset, - const char* buffer_start) -{ + const char* buffer_start) { // We validate the block matches what we generate byte for byte // however we ignore the time section of the header ceph::bufferptr bufferptr = generate_block(block_offset); bool valid = strncmp(bufferptr.c_str(), buffer_start, timeStart()) == 0; - valid = valid ? strncmp(bufferptr.c_str() + timeEnd(), - buffer_start + timeEnd(), - m_model.get_block_size() - timeEnd()) == 0 : valid; + valid = valid + ? strncmp(bufferptr.c_str() + timeEnd(), buffer_start + timeEnd(), + m_model.get_block_size() - timeEnd()) == 0 + : valid; return valid; } const HeaderedSeededRandomGenerator::ErrorType - HeaderedSeededRandomGenerator::getErrorTypeForBlock(uint64_t read_offset, - uint64_t block_offset, - const bufferlist& bufferlist) -{ - try - { - UniqueIdBytes read_unique_run_id = readUniqueRunId(block_offset - read_offset, - bufferlist); - if (unique_run_id != read_unique_run_id) - { +HeaderedSeededRandomGenerator::getErrorTypeForBlock( + uint64_t read_offset, uint64_t block_offset, const bufferlist& bufferlist) { + try { + UniqueIdBytes read_unique_run_id = + readUniqueRunId(block_offset - read_offset, bufferlist); + if (unique_run_id != read_unique_run_id) { return ErrorType::RUN_ID_MISMATCH; } SeedBytes read_seed = readSeed(block_offset - read_offset, bufferlist); - if (m_model.get_seed(block_offset) != read_seed) - { + if (m_model.get_seed(block_offset) != read_seed) { return ErrorType::SEED_MISMATCH; } if (std::strncmp(&bufferlist[((block_offset - read_offset) * - m_model.get_block_size()) + bodyStart()], + m_model.get_block_size()) + + bodyStart()], generate_block(block_offset).c_str() + bodyStart(), - m_model.get_block_size() - bodyStart()) != 0) - { + m_model.get_block_size() - bodyStart()) != 0) { return ErrorType::DATA_MISMATCH; } - } - catch(const std::exception& e) - { + } catch (const std::exception& e) { return ErrorType::DATA_NOT_FOUND; } return ErrorType::UNKNOWN; } -void HeaderedSeededRandomGenerator - ::printDebugInformationForBlock(uint64_t read_offset, uint64_t block_offset, - const bufferlist& bufferlist) -{ - ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset, bufferlist); +void HeaderedSeededRandomGenerator ::printDebugInformationForBlock( + uint64_t read_offset, uint64_t block_offset, const bufferlist& bufferlist) { + ErrorType blockError = + getErrorTypeForBlock(read_offset, block_offset, bufferlist); TimeBytes read_time = 0; std::time_t ttp; @@ -304,433 +279,361 @@ void HeaderedSeededRandomGenerator char read_bytes[m_model.get_block_size()]; char generated_bytes[m_model.get_block_size()]; - if (blockError == ErrorType::DATA_MISMATCH || blockError == ErrorType::UNKNOWN) - { + if (blockError == ErrorType::DATA_MISMATCH || + blockError == ErrorType::UNKNOWN) { read_time = readDateTime(block_offset - read_offset, bufferlist); - std::chrono::system_clock::time_point time_point{std::chrono::milliseconds{read_time}}; + std::chrono::system_clock::time_point time_point{ + std::chrono::milliseconds{read_time}}; ttp = std::chrono::system_clock::to_time_t(time_point); - std::memcpy(&read_bytes, - &bufferlist[((block_offset - read_offset) * m_model.get_block_size())], - m_model.get_block_size() - bodyStart()); - std::memcpy(&generated_bytes, - generate_block(block_offset).c_str(), + std::memcpy( + &read_bytes, + &bufferlist[((block_offset - read_offset) * m_model.get_block_size())], + m_model.get_block_size() - bodyStart()); + std::memcpy(&generated_bytes, generate_block(block_offset).c_str(), m_model.get_block_size() - bodyStart()); } std::string error_string; - switch(blockError) - { - case ErrorType::RUN_ID_MISMATCH: - { - UniqueIdBytes read_unique_run_id = readUniqueRunId((block_offset - read_offset), - bufferlist); - error_string = fmt::format("Header (Run ID) mismatch detected at block {} " - "(byte offset {}) Header expected run id {} but found id {}. " - "Block data corrupt or not written from this instance of this application.", - block_offset, - block_offset * m_model.get_block_size(), - unique_run_id, - read_unique_run_id); - } - break; - - case ErrorType::SEED_MISMATCH: - { + switch (blockError) { + case ErrorType::RUN_ID_MISMATCH: { + UniqueIdBytes read_unique_run_id = + readUniqueRunId((block_offset - read_offset), bufferlist); + error_string = fmt::format( + "Header (Run ID) mismatch detected at block {} " + "(byte offset {}) Header expected run id {} but found id {}. " + "Block data corrupt or not written from this instance of this " + "application.", + block_offset, block_offset * m_model.get_block_size(), unique_run_id, + read_unique_run_id); + } break; + + case ErrorType::SEED_MISMATCH: { SeedBytes read_seed = readSeed((block_offset - read_offset), bufferlist); - if (m_model.get_seed_offsets(read_seed).size() == 0) - { - error_string = fmt::format("Data (Seed) mismatch detected at block {}" - " (byte offset {}). Header expected seed {} but found seed {}. " - "Read data was not from any other recognised block in the object.", - block_offset, - block_offset * m_model.get_block_size(), - m_model.get_seed(block_offset), - read_seed); - } - else - { + if (m_model.get_seed_offsets(read_seed).size() == 0) { + error_string = fmt::format( + "Data (Seed) mismatch detected at block {}" + " (byte offset {}). Header expected seed {} but found seed {}. " + "Read data was not from any other recognised block in the object.", + block_offset, block_offset * m_model.get_block_size(), + m_model.get_seed(block_offset), read_seed); + } else { std::vector<int> seed_offsets = m_model.get_seed_offsets(read_seed); - error_string = fmt::format("Data (Seed) mismatch detected at block {}" - " (byte offset {}). Header expected seed {} but found seed {}." - " Read data was from a different block(s): {}", - block_offset, - block_offset * m_model.get_block_size(), - m_model.get_seed(block_offset), - read_seed, + error_string = fmt::format( + "Data (Seed) mismatch detected at block {}" + " (byte offset {}). Header expected seed {} but found seed {}." + " Read data was from a different block(s): {}", + block_offset, block_offset * m_model.get_block_size(), + m_model.get_seed(block_offset), read_seed, fmt::join(seed_offsets.begin(), seed_offsets.end(), "")); } - } - break; - - case ErrorType::DATA_MISMATCH: - { - error_string = fmt::format("Data (Body) mismatch detected at block {}" - " (byte offset {}). Header data matches, data body does not." - " Data written at {}\nExpected data: \n{:02x}\nRead data:{:02x}", - block_offset, - block_offset * m_model.get_block_size(), + } break; + + case ErrorType::DATA_MISMATCH: { + error_string = fmt::format( + "Data (Body) mismatch detected at block {}" + " (byte offset {}). Header data matches, data body does not." + " Data written at {}\nExpected data: \n{:02x}\nRead data:{:02x}", + block_offset, block_offset * m_model.get_block_size(), std::ctime(&ttp), - fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""), + fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), + ""), fmt::join(read_bytes, read_bytes + m_model.get_block_size(), "")); - } - break; + } break; - case ErrorType::DATA_NOT_FOUND: - { + case ErrorType::DATA_NOT_FOUND: { uint64_t bufferlist_length = bufferlist.to_str().size(); - error_string = fmt::format("Data (Body) could not be read at block {}" - " (byte offset {}) offset in bufferlist returned from read: {}" - " ({} bytes). Returned bufferlist length: {}.", - block_offset, - block_offset * m_model.get_block_size(), + error_string = fmt::format( + "Data (Body) could not be read at block {}" + " (byte offset {}) offset in bufferlist returned from read: {}" + " ({} bytes). Returned bufferlist length: {}.", + block_offset, block_offset * m_model.get_block_size(), (block_offset - read_offset), (block_offset - read_offset) * m_model.get_block_size(), bufferlist_length); - } - break; + } break; case ErrorType::UNKNOWN: - [[ fallthrough ]]; - - default: - { - error_string = fmt::format("Data mismatch detected at block {}" - " (byte offset {}).\nExpected data:\n{:02x}\nRead data:\n{:02x}", - block_offset, - block_offset * m_model.get_block_size(), - fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), ""), + [[fallthrough]]; + + default: { + error_string = fmt::format( + "Data mismatch detected at block {}" + " (byte offset {}).\nExpected data:\n{:02x}\nRead data:\n{:02x}", + block_offset, block_offset * m_model.get_block_size(), + fmt::join(generated_bytes, generated_bytes + m_model.get_block_size(), + ""), fmt::join(read_bytes, read_bytes + m_model.get_block_size(), "")); - } - break; + } break; } dout(0) << error_string << dendl; } -void HeaderedSeededRandomGenerator - ::printDebugInformationForRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - ErrorType rangeError, - const bufferlist& bufferlist) -{ - switch(rangeError) - { - case ErrorType::RUN_ID_MISMATCH: - printDebugInformationForRunIdMismatchRange(read_offset, start_block_offset, - range_length_in_blocks, bufferlist); - break; - case ErrorType::SEED_MISMATCH: - printDebugInformationForSeedMismatchRange(read_offset, start_block_offset, - range_length_in_blocks, bufferlist); - break; - case ErrorType::DATA_MISMATCH: - printDebugInformationDataBodyMismatchRange(read_offset, start_block_offset, - range_length_in_blocks, bufferlist); - break; - case ErrorType::DATA_NOT_FOUND: - printDebugInformationDataNotFoundRange(read_offset, start_block_offset, - range_length_in_blocks, bufferlist); - break; - case ErrorType::UNKNOWN: - [[ fallthrough ]]; - default: - printDebugInformationCorruptRange(read_offset, start_block_offset, - range_length_in_blocks, bufferlist); - break; +void HeaderedSeededRandomGenerator ::printDebugInformationForRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, ErrorType rangeError, + const bufferlist& bufferlist) { + switch (rangeError) { + case ErrorType::RUN_ID_MISMATCH: + printDebugInformationForRunIdMismatchRange( + read_offset, start_block_offset, range_length_in_blocks, bufferlist); + break; + case ErrorType::SEED_MISMATCH: + printDebugInformationForSeedMismatchRange( + read_offset, start_block_offset, range_length_in_blocks, bufferlist); + break; + case ErrorType::DATA_MISMATCH: + printDebugInformationDataBodyMismatchRange( + read_offset, start_block_offset, range_length_in_blocks, bufferlist); + break; + case ErrorType::DATA_NOT_FOUND: + printDebugInformationDataNotFoundRange( + read_offset, start_block_offset, range_length_in_blocks, bufferlist); + break; + case ErrorType::UNKNOWN: + [[fallthrough]]; + default: + printDebugInformationCorruptRange(read_offset, start_block_offset, + range_length_in_blocks, bufferlist); + break; } } -void HeaderedSeededRandomGenerator - ::printDebugInformationForRunIdMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ +void HeaderedSeededRandomGenerator ::printDebugInformationForRunIdMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { uint64_t range_start = start_block_offset; uint64_t range_length = 0; - UniqueIdBytes initial_read_unique_run_id = readUniqueRunId(start_block_offset - read_offset, - bufferlist); + UniqueIdBytes initial_read_unique_run_id = + readUniqueRunId(start_block_offset - read_offset, bufferlist); for (uint64_t i = start_block_offset; - i < start_block_offset + range_length_in_blocks; i++) - { - ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) - == ErrorType::RUN_ID_MISMATCH); + i < start_block_offset + range_length_in_blocks; i++) { + ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) == + ErrorType::RUN_ID_MISMATCH); - UniqueIdBytes read_unique_run_id = readUniqueRunId(i - read_offset, bufferlist); + UniqueIdBytes read_unique_run_id = + readUniqueRunId(i - read_offset, bufferlist); if (initial_read_unique_run_id != read_unique_run_id || - i == (start_block_offset + range_length_in_blocks - 1)) - { - if (range_length == 1) - { + i == (start_block_offset + range_length_in_blocks - 1)) { + if (range_length == 1) { printDebugInformationForBlock(read_offset, i, bufferlist); - } - else if (range_length > 1) - { - dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {} ({} bytes)" - " and spanning a range of {} blocks ({} bytes). " - "Expected run id {} for range but found id {}" - " for all blocks in range. " - "Block data corrupt or not written from this instance of this application.", - range_start, - range_start * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size(), - unique_run_id, - initial_read_unique_run_id) << dendl; + } else if (range_length > 1) { + dout(0) + << fmt::format( + "Data (Run ID) Mismatch detected from block {} ({} bytes)" + " and spanning a range of {} blocks ({} bytes). " + "Expected run id {} for range but found id {}" + " for all blocks in range. " + "Block data corrupt or not written from this instance of " + "this application.", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + unique_run_id, initial_read_unique_run_id) + << dendl; } range_start = i; range_length = 1; initial_read_unique_run_id = read_unique_run_id; - } - else - { + } else { range_length++; } } - if (range_length == 1) - { - printDebugInformationForBlock(read_offset, - start_block_offset + range_length_in_blocks - 1, - bufferlist); - } - else if (range_length > 1) - { - dout(0) << fmt::format("Data (Run ID) Mismatch detected from block {}" - " ({} bytes) and spanning a range of {} blocks ({} bytes). " - "Expected run id {} for range but found id for all blocks in range. " - "Block data corrupt or not written from this instance of this application.", - range_start, - range_start * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size(), - unique_run_id, - initial_read_unique_run_id) + if (range_length == 1) { + printDebugInformationForBlock( + read_offset, start_block_offset + range_length_in_blocks - 1, + bufferlist); + } else if (range_length > 1) { + dout(0) << fmt::format( + "Data (Run ID) Mismatch detected from block {}" + " ({} bytes) and spanning a range of {} blocks ({} bytes). " + "Expected run id {} for range but found id for all blocks " + "in range. " + "Block data corrupt or not written from this instance of " + "this application.", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + unique_run_id, initial_read_unique_run_id) << dendl; } } -void HeaderedSeededRandomGenerator - ::printDebugInformationForSeedMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ +void HeaderedSeededRandomGenerator ::printDebugInformationForSeedMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { uint64_t range_start = start_block_offset; uint64_t range_length = 0; // Assert here if needed, as we can't support values // that can't be converted to a signed integer. - ceph_assert(m_model.get_block_size() < (std::numeric_limits<uint64_t>::max() / 2)); + ceph_assert(m_model.get_block_size() < + (std::numeric_limits<uint64_t>::max() / 2)); std::optional<int64_t> range_offset = 0; for (uint64_t i = start_block_offset; - i < start_block_offset + range_length_in_blocks; i++) - { - ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) - == ErrorType::SEED_MISMATCH); + i < start_block_offset + range_length_in_blocks; i++) { + ceph_assert(getErrorTypeForBlock(read_offset, i, bufferlist) == + ErrorType::SEED_MISMATCH); SeedBytes read_seed = readSeed(i - read_offset, bufferlist); std::vector<int> seed_found_offsets = m_model.get_seed_offsets(read_seed); if ((seed_found_offsets.size() == 1 && - (static_cast<int64_t>(seed_found_offsets.front() - i) == range_offset)) || - range_length == 0) - { - if (range_length == 0) - { + (static_cast<int64_t>(seed_found_offsets.front() - i) == + range_offset)) || + range_length == 0) { + if (range_length == 0) { range_start = i; - if (seed_found_offsets.size() > 0) - { + if (seed_found_offsets.size() > 0) { range_offset = seed_found_offsets.front() - i; - } - else - { + } else { range_offset = std::nullopt; } } range_length++; - } - else - { - if (range_length == 1) - { + } else { + if (range_length == 1) { printDebugInformationForBlock(read_offset, i - 1, bufferlist); - } - else if (range_length > 1 && range_offset.has_value()) - { - dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}" - " ({} bytes) and spanning a range of {} blocks ({} bytes). " - "Returned data located starting from block {} ({} bytes) " - "and spanning a range of {} blocks ({} bytes).", - range_start, - range_start * m_model.get_block_size(), - range_length, range_length * m_model.get_block_size(), - static_cast<uint64_t>(*range_offset) + range_start, - (static_cast<uint64_t>(*range_offset) + range_start) - * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size()) - << dendl; - } - else - { - dout(0) << fmt::format("Data (Seed) Mismatch detected from block {}" - " ({} bytes) and spanning a range of {} blocks ({} bytes). " - "Data seed mismatch spanning a range of {} blocks ({} bytes).", - range_start, - range_start * m_model.get_block_size(), - range_length, range_length * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size()) - << dendl; + } else if (range_length > 1 && range_offset.has_value()) { + dout(0) + << fmt::format( + "Data (Seed) Mismatch detected from block {}" + " ({} bytes) and spanning a range of {} blocks ({} bytes). " + "Returned data located starting from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes).", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + static_cast<uint64_t>(*range_offset) + range_start, + (static_cast<uint64_t>(*range_offset) + range_start) * + m_model.get_block_size(), + range_length, range_length * m_model.get_block_size()) + << dendl; + } else { + dout(0) + << fmt::format( + "Data (Seed) Mismatch detected from block {}" + " ({} bytes) and spanning a range of {} blocks ({} bytes). " + "Data seed mismatch spanning a range of {} blocks ({} " + "bytes).", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size()) + << dendl; } range_length = 1; range_start = i; - if (seed_found_offsets.size() > 0) - { + if (seed_found_offsets.size() > 0) { range_offset = seed_found_offsets.front() - i; - } - else - { + } else { range_offset = std::nullopt; } } } - if (range_length == 1) - { - printDebugInformationForBlock(read_offset, - start_block_offset + range_length_in_blocks - 1, - bufferlist); - } - else if (range_length > 1 && range_offset.has_value()) - { - dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) " - "and spanning a range of {} blocks ({} bytes). " - "Returned data located starting from block {} ({} bytes) " - "and spanning a range of {} blocks ({} bytes).", - range_start, - range_start * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size(), - *range_offset + range_start, - (*range_offset + range_start) * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size()) + if (range_length == 1) { + printDebugInformationForBlock( + read_offset, start_block_offset + range_length_in_blocks - 1, + bufferlist); + } else if (range_length > 1 && range_offset.has_value()) { + dout(0) << fmt::format( + "Data (Seed) Mismatch detected from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes). " + "Returned data located starting from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes).", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + *range_offset + range_start, + (*range_offset + range_start) * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size()) << dendl; - } - else - { - dout(0) << fmt::format("Data (Seed) Mismatch detected from block {} ({} bytes) " - "and spanning a range of {} blocks ({} bytes). " - "and spanning a range of {} blocks ({} bytes).", - range_start, - range_start * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size(), - range_length, - range_length * m_model.get_block_size()) + } else { + dout(0) << fmt::format( + "Data (Seed) Mismatch detected from block {} ({} bytes) " + "and spanning a range of {} blocks ({} bytes). " + "and spanning a range of {} blocks ({} bytes).", + range_start, range_start * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size(), + range_length, range_length * m_model.get_block_size()) << dendl; } } -void HeaderedSeededRandomGenerator -::printDebugInformationDataBodyMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ - dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. " - "Headers look as expected for range, " - "but generated data body does not match. " - "More information given for individual blocks below.", - start_block_offset, - start_block_offset + range_length_in_blocks - 1) +void HeaderedSeededRandomGenerator ::printDebugInformationDataBodyMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { + dout(0) << fmt::format( + "Data Mismatch detected in blocks from {} to {}. " + "Headers look as expected for range, " + "but generated data body does not match. " + "More information given for individual blocks below.", + start_block_offset, + start_block_offset + range_length_in_blocks - 1) << dendl; for (uint64_t i = start_block_offset; - i < start_block_offset + range_length_in_blocks; i++) - { + i < start_block_offset + range_length_in_blocks; i++) { printDebugInformationForBlock(read_offset, i, bufferlist); } } -void HeaderedSeededRandomGenerator - ::printDebugInformationCorruptRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ - dout(0) << fmt::format("Data Mismatch detected in blocks from {} to {}. " - "Headers look as expected for range, " - "but generated data body does not match. " - "More information given for individual blocks below.", - start_block_offset, - start_block_offset + range_length_in_blocks - 1) +void HeaderedSeededRandomGenerator ::printDebugInformationCorruptRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { + dout(0) << fmt::format( + "Data Mismatch detected in blocks from {} to {}. " + "Headers look as expected for range, " + "but generated data body does not match. " + "More information given for individual blocks below.", + start_block_offset, + start_block_offset + range_length_in_blocks - 1) << dendl; for (uint64_t i = start_block_offset; - i < start_block_offset + range_length_in_blocks; i++) - { + i < start_block_offset + range_length_in_blocks; i++) { printDebugInformationForBlock(read_offset, i, bufferlist); } } -void HeaderedSeededRandomGenerator - ::printDebugInformationDataNotFoundRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist) -{ - dout(0) << fmt::format("Data not found for blocks from {} to {}. " - "More information given for individual blocks below.", - start_block_offset, - start_block_offset + range_length_in_blocks - 1) +void HeaderedSeededRandomGenerator ::printDebugInformationDataNotFoundRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist) { + dout(0) << fmt::format( + "Data not found for blocks from {} to {}. " + "More information given for individual blocks below.", + start_block_offset, + start_block_offset + range_length_in_blocks - 1) << dendl; - for (uint64_t i = start_block_offset; i < start_block_offset + range_length_in_blocks; i++) - { + for (uint64_t i = start_block_offset; + i < start_block_offset + range_length_in_blocks; i++) { printDebugInformationForBlock(read_offset, i, bufferlist); } } -void HeaderedSeededRandomGenerator - ::printDebugInformationForOffsets(uint64_t read_offset, - std::vector<uint64_t> offsets, - const bufferlist& bufferlist) -{ +void HeaderedSeededRandomGenerator ::printDebugInformationForOffsets( + uint64_t read_offset, std::vector<uint64_t> offsets, + const bufferlist& bufferlist) { uint64_t range_start = 0; uint64_t range_length = 0; ErrorType rangeError = ErrorType::UNKNOWN; - for (const uint64_t& block_offset : offsets) - { - ErrorType blockError = getErrorTypeForBlock(read_offset, block_offset, - bufferlist); + for (const uint64_t& block_offset : offsets) { + ErrorType blockError = + getErrorTypeForBlock(read_offset, block_offset, bufferlist); - if (range_start == 0 && range_length == 0) - { + if (range_start == 0 && range_length == 0) { range_start = block_offset; range_length = 1; rangeError = blockError; - } - else if (blockError == rangeError && - range_start + range_length == block_offset) -{ + } else if (blockError == rangeError && + range_start + range_length == block_offset) { range_length++; - } - else - { - if (range_length == 1) - { + } else { + if (range_length == 1) { printDebugInformationForBlock(read_offset, range_start, bufferlist); - } - else if (range_length > 1) - { + } else if (range_length > 1) { printDebugInformationForRange(read_offset, range_start, range_length, rangeError, bufferlist); } @@ -741,12 +644,9 @@ void HeaderedSeededRandomGenerator } } - if (range_length == 1) - { + if (range_length == 1) { printDebugInformationForBlock(read_offset, range_start, bufferlist); - } - else if (range_length > 1) - { + } else if (range_length > 1) { printDebugInformationForRange(read_offset, range_start, range_length, rangeError, bufferlist); } diff --git a/src/common/io_exerciser/DataGenerator.h b/src/common/io_exerciser/DataGenerator.h index 1e5784a54cc..c497c78ed61 100644 --- a/src/common/io_exerciser/DataGenerator.h +++ b/src/common/io_exerciser/DataGenerator.h @@ -3,8 +3,8 @@ #include <memory> #include <random> -#include "include/buffer.h" #include "ObjectModel.h" +#include "include/buffer.h" /* Overview * @@ -23,149 +23,139 @@ * * class HeaderedSeededRandomGenerator * Inherits from SeededDataGenerator. Generates entirely random patterns - * based on the seed retrieved by the model, however also appends a + * based on the seed retrieved by the model, however also appends a * header to the start of each block. This generator also provides * a range of verbose debug options to help disagnose a miscompare * whenever it detects unexpected data. */ namespace ceph { - namespace io_exerciser { - namespace data_generation { - enum class GenerationType { - SeededRandom, - HeaderedSeededRandom - // CompressedGenerator - // MixedGenerator - }; - - class DataGenerator { - public: - virtual ~DataGenerator() = default; - static std::unique_ptr<DataGenerator> - create_generator(GenerationType generatorType, - const ObjectModel& model); - virtual bufferlist generate_data(uint64_t length, uint64_t offset)=0; - virtual bool validate(bufferlist& bufferlist, uint64_t offset, - uint64_t length); - - // Used for testing debug outputs from data generation - virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length); - - protected: - const ObjectModel& m_model; - - DataGenerator(const ObjectModel& model) : m_model(model) {} - }; - - class SeededRandomGenerator : public DataGenerator - { - public: - SeededRandomGenerator(const ObjectModel& model) - : DataGenerator(model) {} - - virtual bufferptr generate_block(uint64_t offset); - virtual bufferlist generate_data(uint64_t length, uint64_t offset); - virtual bufferptr generate_wrong_block(uint64_t offset); - virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length) override; - }; - - class HeaderedSeededRandomGenerator : public SeededRandomGenerator - { - public: - HeaderedSeededRandomGenerator(const ObjectModel& model, - std::optional<uint64_t> unique_run_id = std::nullopt); - - bufferptr generate_block(uint64_t offset) override; - bufferptr generate_wrong_block(uint64_t offset) override; - bool validate(bufferlist& bufferlist, uint64_t offset, - uint64_t length) override; - - private: - using UniqueIdBytes = uint64_t; - using SeedBytes = int; - using TimeBytes = uint64_t; - - enum class ErrorType { - RUN_ID_MISMATCH, - SEED_MISMATCH, - DATA_MISMATCH, - DATA_NOT_FOUND, - UNKNOWN - }; - - constexpr uint8_t headerStart() const - { return 0; }; - constexpr uint8_t uniqueIdStart() const - { return headerStart(); }; - constexpr uint8_t uniqueIdLength() const - { return sizeof(UniqueIdBytes); }; - constexpr uint8_t seedStart() const - { return uniqueIdStart() + uniqueIdLength(); }; - constexpr uint8_t seedLength() const - { return sizeof(SeedBytes); }; - constexpr uint8_t timeStart() const - { return seedStart() + seedLength(); }; - constexpr uint8_t timeLength() const - { return sizeof(TimeBytes); }; - constexpr uint8_t timeEnd() const - { return timeStart() + timeLength(); }; - constexpr uint8_t headerLength() const - { return uniqueIdLength() + seedLength() + timeLength(); }; - constexpr uint8_t bodyStart() const - { return headerStart() + headerLength(); }; - - const UniqueIdBytes readUniqueRunId(uint64_t block_offset, - const bufferlist& bufferlist); - const SeedBytes readSeed(uint64_t block_offset, - const bufferlist& bufferlist); - const TimeBytes readDateTime(uint64_t block_offset, +namespace io_exerciser { +namespace data_generation { +enum class GenerationType { + SeededRandom, + HeaderedSeededRandom + // CompressedGenerator + // MixedGenerator +}; + +class DataGenerator { + public: + virtual ~DataGenerator() = default; + static std::unique_ptr<DataGenerator> create_generator( + GenerationType generatorType, const ObjectModel& model); + virtual bufferlist generate_data(uint64_t length, uint64_t offset) = 0; + virtual bool validate(bufferlist& bufferlist, uint64_t offset, + uint64_t length); + + // Used for testing debug outputs from data generation + virtual bufferlist generate_wrong_data(uint64_t offset, uint64_t length); + + protected: + const ObjectModel& m_model; + + DataGenerator(const ObjectModel& model) : m_model(model) {} +}; + +class SeededRandomGenerator : public DataGenerator { + public: + SeededRandomGenerator(const ObjectModel& model) : DataGenerator(model) {} + + virtual bufferptr generate_block(uint64_t offset); + bufferlist generate_data(uint64_t length, uint64_t offset) override; + virtual bufferptr generate_wrong_block(uint64_t offset); + bufferlist generate_wrong_data(uint64_t offset, + uint64_t length) override; +}; + +class HeaderedSeededRandomGenerator : public SeededRandomGenerator { + public: + HeaderedSeededRandomGenerator( + const ObjectModel& model, + std::optional<uint64_t> unique_run_id = std::nullopt); + + bufferptr generate_block(uint64_t offset) override; + bufferptr generate_wrong_block(uint64_t offset) override; + bool validate(bufferlist& bufferlist, uint64_t offset, + uint64_t length) override; + + private: + using UniqueIdBytes = uint64_t; + using SeedBytes = int; + using TimeBytes = uint64_t; + + enum class ErrorType { + RUN_ID_MISMATCH, + SEED_MISMATCH, + DATA_MISMATCH, + DATA_NOT_FOUND, + UNKNOWN + }; + + constexpr uint8_t headerStart() const { return 0; }; + constexpr uint8_t uniqueIdStart() const { return headerStart(); }; + constexpr uint8_t uniqueIdLength() const { return sizeof(UniqueIdBytes); }; + constexpr uint8_t seedStart() const { + return uniqueIdStart() + uniqueIdLength(); + }; + constexpr uint8_t seedLength() const { return sizeof(SeedBytes); }; + constexpr uint8_t timeStart() const { return seedStart() + seedLength(); }; + constexpr uint8_t timeLength() const { return sizeof(TimeBytes); }; + constexpr uint8_t timeEnd() const { return timeStart() + timeLength(); }; + constexpr uint8_t headerLength() const { + return uniqueIdLength() + seedLength() + timeLength(); + }; + constexpr uint8_t bodyStart() const { + return headerStart() + headerLength(); + }; + + const UniqueIdBytes readUniqueRunId(uint64_t block_offset, + const bufferlist& bufferlist); + const SeedBytes readSeed(uint64_t block_offset, const bufferlist& bufferlist); + const TimeBytes readDateTime(uint64_t block_offset, + const bufferlist& bufferlist); + + const UniqueIdBytes unique_run_id; + + uint64_t generate_unique_run_id(); + + bool validate_block(uint64_t block_offset, const char* buffer_start); + + const ErrorType getErrorTypeForBlock(uint64_t read_offset, + uint64_t block_offset, const bufferlist& bufferlist); - const UniqueIdBytes unique_run_id; - - uint64_t generate_unique_run_id(); - - bool validate_block(uint64_t block_offset, const char* buffer_start); - - const ErrorType getErrorTypeForBlock(uint64_t read_offset, - uint64_t block_offset, - const bufferlist& bufferlist); - - void printDebugInformationForBlock(uint64_t read_offset, - uint64_t block_offset, - const bufferlist& bufferlist); - void printDebugInformationForRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - ErrorType rangeError, - const bufferlist& bufferlist); - - void printDebugInformationForRunIdMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist); - void printDebugInformationForSeedMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist); - void printDebugInformationDataBodyMismatchRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist); - void printDebugInformationDataNotFoundRange(uint64_t ßread_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist); - void printDebugInformationCorruptRange(uint64_t read_offset, - uint64_t start_block_offset, - uint64_t range_length_in_blocks, - const bufferlist& bufferlist); - - void printDebugInformationForOffsets(uint64_t read_offset, - std::vector<uint64_t> offsets, - const bufferlist& bufferlist); - }; - } - } -} + void printDebugInformationForBlock(uint64_t read_offset, + uint64_t block_offset, + const bufferlist& bufferlist); + void printDebugInformationForRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + ErrorType rangeError, + const bufferlist& bufferlist); + + void printDebugInformationForRunIdMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist); + void printDebugInformationForSeedMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist); + void printDebugInformationDataBodyMismatchRange( + uint64_t read_offset, uint64_t start_block_offset, + uint64_t range_length_in_blocks, const bufferlist& bufferlist); + void printDebugInformationDataNotFoundRange(uint64_t ßread_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + const bufferlist& bufferlist); + void printDebugInformationCorruptRange(uint64_t read_offset, + uint64_t start_block_offset, + uint64_t range_length_in_blocks, + const bufferlist& bufferlist); + + void printDebugInformationForOffsets(uint64_t read_offset, + std::vector<uint64_t> offsets, + const bufferlist& bufferlist); +}; +} // namespace data_generation +} // namespace io_exerciser +} // namespace ceph diff --git a/src/common/io_exerciser/EcIoSequence.cc b/src/common/io_exerciser/EcIoSequence.cc new file mode 100644 index 00000000000..611920c96e0 --- /dev/null +++ b/src/common/io_exerciser/EcIoSequence.cc @@ -0,0 +1,267 @@ +#include "EcIoSequence.h" + +#include <memory> + +using IoOp = ceph::io_exerciser::IoOp; +using Sequence = ceph::io_exerciser::Sequence; +using IoSequence = ceph::io_exerciser::IoSequence; +using EcIoSequence = ceph::io_exerciser::EcIoSequence; +using ReadInjectSequence = ceph::io_exerciser::ReadInjectSequence; + +bool EcIoSequence::is_supported(Sequence sequence) const { return true; } + +std::unique_ptr<IoSequence> EcIoSequence::generate_sequence( + Sequence sequence, std::pair<int, int> obj_size_range, int k, int m, + int seed) { + switch (sequence) { + case Sequence::SEQUENCE_SEQ0: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ1: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ2: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ3: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ4: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ5: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ6: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ7: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ8: + [[fallthrough]]; + case Sequence::SEQUENCE_SEQ9: + return std::make_unique<ReadInjectSequence>(obj_size_range, seed, + sequence, k, m); + case Sequence::SEQUENCE_SEQ10: + return std::make_unique<Seq10>(obj_size_range, seed, k, m); + default: + ceph_abort_msg("Unrecognised sequence"); + } +} + +EcIoSequence::EcIoSequence(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), + setup_inject(false), + clear_inject(false), + shard_to_inject(std::nullopt) {} + +void EcIoSequence::select_random_data_shard_to_inject_read_error(int k, int m) { + shard_to_inject = rng(k - 1); + setup_inject = true; +} + +void EcIoSequence::select_random_data_shard_to_inject_write_error(int k, + int m) { + // Write errors do not support injecting to the primary OSD + shard_to_inject = rng(1, k - 1); + setup_inject = true; +} + +void EcIoSequence::select_random_shard_to_inject_read_error(int k, int m) { + shard_to_inject = rng(k + m - 1); + setup_inject = true; +} + +void EcIoSequence::select_random_shard_to_inject_write_error(int k, int m) { + // Write errors do not support injecting to the primary OSD + shard_to_inject = rng(1, k + m - 1); + setup_inject = true; +} + +void EcIoSequence::generate_random_read_inject_type() { + inject_op_type = static_cast<InjectOpType>( + rng(static_cast<int>(InjectOpType::ReadEIO), + static_cast<int>(InjectOpType::ReadMissingShard))); +} + +void EcIoSequence::generate_random_write_inject_type() { + inject_op_type = static_cast<InjectOpType>( + rng(static_cast<int>(InjectOpType::WriteFailAndRollback), + static_cast<int>(InjectOpType::WriteOSDAbort))); +} + +ceph::io_exerciser::ReadInjectSequence::ReadInjectSequence( + std::pair<int, int> obj_size_range, int seed, Sequence s, int k, int m) + : EcIoSequence(obj_size_range, seed) { + child_sequence = IoSequence::generate_sequence(s, obj_size_range, seed); + select_random_data_shard_to_inject_read_error(k, m); + generate_random_read_inject_type(); +} + +Sequence ceph::io_exerciser::ReadInjectSequence::get_id() const { + return child_sequence->get_id(); +} + +std::string ceph::io_exerciser::ReadInjectSequence::get_name() const { + return child_sequence->get_name() + + " running with read errors injected on shard " + + std::to_string(*shard_to_inject); +} + +std::unique_ptr<IoOp> ReadInjectSequence::next() { + step++; + + if (nextOp) { + std::unique_ptr<IoOp> retOp = nullptr; + nextOp.swap(retOp); + return retOp; + } + + std::unique_ptr<IoOp> childOp = child_sequence->next(); + + switch (childOp->getOpType()) { + case OpType::Remove: + nextOp.swap(childOp); + switch (inject_op_type) { + case InjectOpType::ReadEIO: + return ClearReadErrorInjectOp::generate(*shard_to_inject, 0); + case InjectOpType::ReadMissingShard: + return ClearReadErrorInjectOp::generate(*shard_to_inject, 1); + case InjectOpType::WriteFailAndRollback: + return ClearWriteErrorInjectOp::generate(*shard_to_inject, 0); + case InjectOpType::WriteOSDAbort: + return ClearWriteErrorInjectOp::generate(*shard_to_inject, 3); + case InjectOpType::None: + [[fallthrough]]; + default: + ceph_abort_msg("Unsupported operation"); + } + break; + case OpType::Create: + switch (inject_op_type) { + case InjectOpType::ReadEIO: + nextOp = InjectReadErrorOp::generate( + *shard_to_inject, 0, 0, std::numeric_limits<uint64_t>::max()); + break; + case InjectOpType::ReadMissingShard: + nextOp = InjectReadErrorOp::generate( + *shard_to_inject, 1, 0, std::numeric_limits<uint64_t>::max()); + break; + case InjectOpType::WriteFailAndRollback: + nextOp = InjectWriteErrorOp::generate( + *shard_to_inject, 0, 0, std::numeric_limits<uint64_t>::max()); + break; + case InjectOpType::WriteOSDAbort: + nextOp = InjectWriteErrorOp::generate( + *shard_to_inject, 3, 0, std::numeric_limits<uint64_t>::max()); + break; + case InjectOpType::None: + [[fallthrough]]; + default: + ceph_abort_msg("Unsupported operation"); + } + break; + default: + // Do nothing in default case + break; + } + + return childOp; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> +ceph::io_exerciser::ReadInjectSequence::_next() { + ceph_abort_msg( + "Should not reach this point, " + "this sequence should only consume complete sequences"); + + return DoneOp::generate(); +} + +ceph::io_exerciser::Seq10::Seq10(std::pair<int, int> obj_size_range, int seed, + int k, int m) + : EcIoSequence(obj_size_range, seed), + offset(0), + length(1), + inject_error_done(false), + failed_write_done(false), + read_done(false), + successful_write_done(false), + test_all_lengths(false), // Only test length(1) due to time constraints + test_all_sizes( + false) // Only test obj_size(rand()) due to time constraints +{ + select_random_shard_to_inject_write_error(k, m); + // We will inject specifically as part of our sequence in this sequence + setup_inject = false; + if (!test_all_sizes) { + select_random_object_size(); + } +} + +Sequence ceph::io_exerciser::Seq10::get_id() const { + return Sequence::SEQUENCE_SEQ10; +} + +std::string ceph::io_exerciser::Seq10::get_name() const { + return "Sequential writes of length " + std::to_string(length) + + " with queue depth 1" + " first injecting a failed write and read it to ensure it rolls back, " + "then" + " successfully writing the data and reading the write the ensure it " + "is applied"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq10::_next() { + if (!inject_error_done) { + inject_error_done = true; + return InjectWriteErrorOp::generate(*shard_to_inject, 0, 0, + std::numeric_limits<uint64_t>::max()); + } else if (!failed_write_done) { + failed_write_done = true; + read_done = false; + barrier = true; + return SingleFailedWriteOp::generate(offset, length); + } else if (failed_write_done && !read_done) { + read_done = true; + barrier = true; + return SingleReadOp::generate(offset, length); + } else if (!clear_inject_done) { + clear_inject_done = true; + return ClearWriteErrorInjectOp::generate(*shard_to_inject, 0); + } else if (!successful_write_done) { + successful_write_done = true; + read_done = false; + barrier = true; + return SingleWriteOp::generate(offset, length); + } else if (successful_write_done && !read_done) { + read_done = true; + return SingleReadOp::generate(offset, length); + } else if (successful_write_done && read_done) { + offset++; + inject_error_done = false; + failed_write_done = false; + read_done = false; + clear_inject_done = false; + successful_write_done = false; + + if (offset + length >= obj_size) { + if (!test_all_lengths) { + remove = true; + done = true; + return BarrierOp::generate(); + } + + offset = 0; + length++; + if (length > obj_size) { + if (!test_all_sizes) { + remove = true; + done = true; + return BarrierOp::generate(); + } + + length = 1; + return increment_object_size(); + } + } + + return BarrierOp::generate(); + } else { + ceph_abort_msg("Sequence in undefined state. Aborting"); + return DoneOp::generate(); + } +}
\ No newline at end of file diff --git a/src/common/io_exerciser/EcIoSequence.h b/src/common/io_exerciser/EcIoSequence.h new file mode 100644 index 00000000000..37283b3906b --- /dev/null +++ b/src/common/io_exerciser/EcIoSequence.h @@ -0,0 +1,65 @@ +#include "IoSequence.h" + +namespace ceph { +namespace io_exerciser { +class EcIoSequence : public IoSequence { + public: + virtual bool is_supported(Sequence sequence) const override; + static std::unique_ptr<IoSequence> generate_sequence( + Sequence s, std::pair<int, int> obj_size_range, int k, int m, int seed); + + protected: + bool setup_inject; + bool clear_inject; + std::optional<uint64_t> shard_to_inject; + InjectOpType inject_op_type; + + EcIoSequence(std::pair<int, int> obj_size_range, int seed); + + // Writes cannot be sent to injected on shard zero, so selections seperated + // out + void select_random_data_shard_to_inject_read_error(int k, int m); + void select_random_data_shard_to_inject_write_error(int k, int m); + void select_random_shard_to_inject_read_error(int k, int m); + void select_random_shard_to_inject_write_error(int k, int m); + void generate_random_read_inject_type(); + void generate_random_write_inject_type(); +}; + +class ReadInjectSequence : public EcIoSequence { + public: + ReadInjectSequence(std::pair<int, int> obj_size_range, int seed, Sequence s, + int k, int m); + + Sequence get_id() const override; + std::string get_name() const override; + virtual std::unique_ptr<IoOp> next() override; + std::unique_ptr<IoOp> _next() override; + + private: + std::unique_ptr<IoSequence> child_sequence; + std::unique_ptr<IoOp> nextOp; +}; + +class Seq10 : public EcIoSequence { + public: + Seq10(std::pair<int, int> obj_size_range, int seed, int k, int m); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + + bool inject_error_done; + bool failed_write_done; + bool read_done; + bool clear_inject_done; + bool successful_write_done; + bool test_all_lengths; + bool test_all_sizes; +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/IoOp.cc b/src/common/io_exerciser/IoOp.cc index cd855ba6fff..493d1f435b4 100644 --- a/src/common/io_exerciser/IoOp.cc +++ b/src/common/io_exerciser/IoOp.cc @@ -1,188 +1,316 @@ #include "IoOp.h" -using IoOp = ceph::io_exerciser::IoOp; +#include "fmt/format.h" +#include "include/ceph_assert.h" -IoOp::IoOp( OpType op, - uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2, - uint64_t offset3, uint64_t length3) : - op(op), - offset1(offset1), length1(length1), - offset2(offset2), length2(length2), - offset3(offset3), length3(length3) -{ +using IoOp = ceph::io_exerciser::IoOp; +using OpType = ceph::io_exerciser::OpType; -} +using DoneOp = ceph::io_exerciser::DoneOp; +using BarrierOp = ceph::io_exerciser::BarrierOp; +using CreateOp = ceph::io_exerciser::CreateOp; +using RemoveOp = ceph::io_exerciser::RemoveOp; +using SingleReadOp = ceph::io_exerciser::SingleReadOp; +using DoubleReadOp = ceph::io_exerciser::DoubleReadOp; +using TripleReadOp = ceph::io_exerciser::TripleReadOp; +using SingleWriteOp = ceph::io_exerciser::SingleWriteOp; +using DoubleWriteOp = ceph::io_exerciser::DoubleWriteOp; +using TripleWriteOp = ceph::io_exerciser::TripleWriteOp; +using SingleFailedWriteOp = ceph::io_exerciser::SingleFailedWriteOp; +using DoubleFailedWriteOp = ceph::io_exerciser::DoubleFailedWriteOp; +using TripleFailedWriteOp = ceph::io_exerciser::TripleFailedWriteOp; -std::string IoOp::value_to_string(uint64_t v) const -{ +namespace { +std::string value_to_string(uint64_t v) { if (v < 1024 || (v % 1024) != 0) { return std::to_string(v); - }else if (v < 1024*1024 || (v % (1024 * 1024)) != 0 ) { + } else if (v < 1024 * 1024 || (v % (1024 * 1024)) != 0) { return std::to_string(v / 1024) + "K"; - }else{ + } else { return std::to_string(v / 1024 / 1024) + "M"; } } +} // namespace -std::unique_ptr<IoOp> IoOp - ::generate_done() { +IoOp::IoOp() {} - return std::make_unique<IoOp>(OpType::Done); -} +template <OpType opType> +ceph::io_exerciser::TestOp<opType>::TestOp() : IoOp() {} + +DoneOp::DoneOp() : TestOp<OpType::Done>() {} -std::unique_ptr<IoOp> IoOp - ::generate_barrier() { +std::string DoneOp::to_string(uint64_t block_size) const { return "Done"; } - return std::make_unique<IoOp>(OpType::BARRIER); +std::unique_ptr<DoneOp> DoneOp::generate() { + return std::make_unique<DoneOp>(); } -std::unique_ptr<IoOp> IoOp - ::generate_create(uint64_t size) { +BarrierOp::BarrierOp() : TestOp<OpType::Barrier>() {} - return std::make_unique<IoOp>(OpType::CREATE,0,size); +std::unique_ptr<BarrierOp> BarrierOp::generate() { + return std::make_unique<BarrierOp>(); } -std::unique_ptr<IoOp> IoOp - ::generate_remove() { - - return std::make_unique<IoOp>(OpType::REMOVE); +std::string BarrierOp::to_string(uint64_t block_size) const { + return "Barrier"; } -std::unique_ptr<IoOp> IoOp - ::generate_read(uint64_t offset, uint64_t length) { +CreateOp::CreateOp(uint64_t size) : TestOp<OpType::Create>(), size(size) {} - return std::make_unique<IoOp>(OpType::READ, offset, length); +std::unique_ptr<CreateOp> CreateOp::generate(uint64_t size) { + return std::make_unique<CreateOp>(size); } -std::unique_ptr<IoOp> IoOp - ::generate_read2(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2) { +std::string CreateOp::to_string(uint64_t block_size) const { + return "Create (size=" + value_to_string(size * block_size) + ")"; +} - if (offset1 < offset2) { - ceph_assert( offset1 + length1 <= offset2 ); - } else { - ceph_assert( offset2 + length2 <= offset1 ); - } +RemoveOp::RemoveOp() : TestOp<OpType::Remove>() {} - return std::make_unique<IoOp>(OpType::READ2, - offset1, length1, - offset2, length2); +std::unique_ptr<RemoveOp> RemoveOp::generate() { + return std::make_unique<RemoveOp>(); } -std::unique_ptr<IoOp> IoOp - ::generate_read3(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2, - uint64_t offset3, uint64_t length3) { +std::string RemoveOp::to_string(uint64_t block_size) const { return "Remove"; } - if (offset1 < offset2) { - ceph_assert( offset1 + length1 <= offset2 ); - } else { - ceph_assert( offset2 + length2 <= offset1 ); +template <OpType opType, int numIOs> +ceph::io_exerciser::ReadWriteOp<opType, numIOs>::ReadWriteOp( + std::array<uint64_t, numIOs>&& offset, + std::array<uint64_t, numIOs>&& length) + : TestOp<opType>(), offset(offset), length(length) { + auto compare = [](uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2) { + if (offset1 < offset2) { + ceph_assert(offset1 + length1 <= offset2); + } else { + ceph_assert(offset2 + length2 <= offset1); + } + }; + + if (numIOs > 1) { + for (int i = 0; i < numIOs - 1; i++) { + for (int j = i + 1; j < numIOs; j++) { + compare(offset[i], length[i], offset[j], length[j]); + } + } } - if (offset1 < offset3) { - ceph_assert( offset1 + length1 <= offset3 ); - } else { - ceph_assert( offset3 + length3 <= offset1 ); +} + +template <OpType opType, int numIOs> +std::string ceph::io_exerciser::ReadWriteOp<opType, numIOs>::to_string( + uint64_t block_size) const { + std::string offset_length_desc; + if (numIOs > 0) { + offset_length_desc += fmt::format( + "offset1={}", value_to_string(this->offset[0] * block_size)); + offset_length_desc += fmt::format( + ",length1={}", value_to_string(this->length[0] * block_size)); + for (int i = 1; i < numIOs; i++) { + offset_length_desc += fmt::format( + ",offset{}={}", i + 1, value_to_string(this->offset[i] * block_size)); + offset_length_desc += fmt::format( + ",length{}={}", i + 1, value_to_string(this->length[i] * block_size)); + } } - if (offset2 < offset3) { - ceph_assert( offset2 + length2 <= offset3 ); - } else { - ceph_assert( offset3 + length3 <= offset2 ); + switch (opType) { + case OpType::Read: + [[fallthrough]]; + case OpType::Read2: + [[fallthrough]]; + case OpType::Read3: + return fmt::format("Read{} ({})", numIOs, offset_length_desc); + case OpType::Write: + [[fallthrough]]; + case OpType::Write2: + [[fallthrough]]; + case OpType::Write3: + return fmt::format("Write{} ({})", numIOs, offset_length_desc); + case OpType::FailedWrite: + [[fallthrough]]; + case OpType::FailedWrite2: + [[fallthrough]]; + case OpType::FailedWrite3: + return fmt::format("FailedWrite{} ({})", numIOs, offset_length_desc); + default: + ceph_abort_msg( + fmt::format("Unsupported op type by ReadWriteOp ({})", opType)); } - return std::make_unique<IoOp>(OpType::READ3, - offset1, length1, - offset2, length2, - offset3, length3); } -std::unique_ptr<IoOp> IoOp::generate_write(uint64_t offset, uint64_t length) { - return std::make_unique<IoOp>(OpType::WRITE, offset, length); +SingleReadOp::SingleReadOp(uint64_t offset, uint64_t length) + : ReadWriteOp<OpType::Read, 1>({offset}, {length}) {} + +std::unique_ptr<SingleReadOp> SingleReadOp::generate(uint64_t offset, + uint64_t length) { + return std::make_unique<SingleReadOp>(offset, length); } -std::unique_ptr<IoOp> IoOp::generate_write2(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2) { - if (offset1 < offset2) { - ceph_assert( offset1 + length1 <= offset2 ); - } else { - ceph_assert( offset2 + length2 <= offset1 ); - } - return std::make_unique<IoOp>(OpType::WRITE2, - offset1, length1, - offset2, length2); +DoubleReadOp::DoubleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2) + : ReadWriteOp<OpType::Read2, 2>({offset1, offset2}, {length1, length2}) {} + +std::unique_ptr<DoubleReadOp> DoubleReadOp::generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2) { + return std::make_unique<DoubleReadOp>(offset1, length1, offset2, length2); } -std::unique_ptr<IoOp> IoOp::generate_write3(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2, - uint64_t offset3, uint64_t length3) { - if (offset1 < offset2) { - ceph_assert( offset1 + length1 <= offset2 ); - } else { - ceph_assert( offset2 + length2 <= offset1 ); - } - if (offset1 < offset3) { - ceph_assert( offset1 + length1 <= offset3 ); - } else { - ceph_assert( offset3 + length3 <= offset1 ); - } - if (offset2 < offset3) { - ceph_assert( offset2 + length2 <= offset3 ); - } else { - ceph_assert( offset3 + length3 <= offset2 ); - } - return std::make_unique<IoOp>(OpType::WRITE3, - offset1, length1, - offset2, length2, - offset3, length3); -} - -bool IoOp::done() { - return (op == OpType::Done); -} - -std::string IoOp::to_string(uint64_t block_size) const -{ - switch (op) { - case OpType::Done: - return "Done"; - case OpType::BARRIER: - return "Barrier"; - case OpType::CREATE: - return "Create (size=" + value_to_string(length1 * block_size) + ")"; - case OpType::REMOVE: - return "Remove"; - case OpType::READ: - return "Read (offset=" + value_to_string(offset1 * block_size) + - ",length=" + value_to_string(length1 * block_size) + ")"; - case OpType::READ2: - return "Read2 (offset1=" + value_to_string(offset1 * block_size) + - ",length1=" + value_to_string(length1 * block_size) + - ",offset2=" + value_to_string(offset2 * block_size) + - ",length2=" + value_to_string(length2 * block_size) + ")"; - case OpType::READ3: - return "Read3 (offset1=" + value_to_string(offset1 * block_size) + - ",length1=" + value_to_string(length1 * block_size) + - ",offset2=" + value_to_string(offset2 * block_size) + - ",length2=" + value_to_string(length2 * block_size) + - ",offset3=" + value_to_string(offset3 * block_size) + - ",length3=" + value_to_string(length3 * block_size) + ")"; - case OpType::WRITE: - return "Write (offset=" + value_to_string(offset1 * block_size) + - ",length=" + value_to_string(length1 * block_size) + ")"; - case OpType::WRITE2: - return "Write2 (offset1=" + value_to_string(offset1 * block_size) + - ",length1=" + value_to_string(length1 * block_size) + - ",offset2=" + value_to_string(offset2 * block_size) + - ",length2=" + value_to_string(length2 * block_size) + ")"; - case OpType::WRITE3: - return "Write3 (offset1=" + value_to_string(offset1 * block_size) + - ",length1=" + value_to_string(length1 * block_size) + - ",offset2=" + value_to_string(offset2 * block_size) + - ",length2=" + value_to_string(length2 * block_size) + - ",offset3=" + value_to_string(offset3 * block_size) + - ",length3=" + value_to_string(length3 * block_size) + ")"; - default: - break; - } - return "Unknown"; +TripleReadOp::TripleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2, uint64_t offset3, uint64_t length3) + : ReadWriteOp<OpType::Read3, 3>({offset1, offset2, offset3}, + {length1, length2, length3}) {} + +std::unique_ptr<TripleReadOp> TripleReadOp::generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) { + return std::make_unique<TripleReadOp>(offset1, length1, offset2, length2, + offset3, length3); +} + +SingleWriteOp::SingleWriteOp(uint64_t offset, uint64_t length) + : ReadWriteOp<OpType::Write, 1>({offset}, {length}) {} + +std::unique_ptr<SingleWriteOp> SingleWriteOp::generate(uint64_t offset, + uint64_t length) { + return std::make_unique<SingleWriteOp>(offset, length); +} + +DoubleWriteOp::DoubleWriteOp(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2) + : ReadWriteOp<OpType::Write2, 2>({offset1, offset2}, {length1, length2}) {} + +std::unique_ptr<DoubleWriteOp> DoubleWriteOp::generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2) { + return std::make_unique<DoubleWriteOp>(offset1, length1, offset2, length2); +} + +TripleWriteOp::TripleWriteOp(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) + : ReadWriteOp<OpType::Write3, 3>({offset1, offset2, offset3}, + {length1, length2, length3}) {} + +std::unique_ptr<TripleWriteOp> TripleWriteOp::generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) { + return std::make_unique<TripleWriteOp>(offset1, length1, offset2, length2, + offset3, length3); +} + +SingleFailedWriteOp::SingleFailedWriteOp(uint64_t offset, uint64_t length) + : ReadWriteOp<OpType::FailedWrite, 1>({offset}, {length}) {} + +std::unique_ptr<SingleFailedWriteOp> SingleFailedWriteOp::generate( + uint64_t offset, uint64_t length) { + return std::make_unique<SingleFailedWriteOp>(offset, length); +} + +DoubleFailedWriteOp::DoubleFailedWriteOp(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2) + : ReadWriteOp<OpType::FailedWrite2, 2>({offset1, offset2}, + {length1, length2}) {} + +std::unique_ptr<DoubleFailedWriteOp> DoubleFailedWriteOp::generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2) { + return std::make_unique<DoubleFailedWriteOp>(offset1, length1, offset2, + length2); +} + +TripleFailedWriteOp::TripleFailedWriteOp(uint64_t offset1, uint64_t length1, + uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) + : ReadWriteOp<OpType::FailedWrite3, 3>({offset1, offset2, offset3}, + {length1, length2, length3}) {} + +std::unique_ptr<TripleFailedWriteOp> TripleFailedWriteOp::generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3) { + return std::make_unique<TripleFailedWriteOp>(offset1, length1, offset2, + length2, offset3, length3); +} + +template <ceph::io_exerciser::OpType opType> +ceph::io_exerciser::InjectErrorOp<opType>::InjectErrorOp( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) + : TestOp<opType>(), + shard(shard), + type(type), + when(when), + duration(duration) {} + +template <ceph::io_exerciser::OpType opType> +std::string ceph::io_exerciser::InjectErrorOp<opType>::to_string( + uint64_t blocksize) const { + std::string_view inject_type = get_inject_type_string(); + return fmt::format( + "Inject {} error on shard {} of type {}" + " after {} successful inject(s) lasting {} inject(s)", + inject_type, shard, type.value_or(0), when.value_or(0), + duration.value_or(1)); +} + +ceph::io_exerciser::InjectReadErrorOp::InjectReadErrorOp( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) + : InjectErrorOp<OpType::InjectReadError>(shard, type, when, duration) {} + +std::unique_ptr<ceph::io_exerciser::InjectReadErrorOp> +ceph::io_exerciser ::InjectReadErrorOp::generate( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) { + return std::make_unique<InjectReadErrorOp>(shard, type, when, duration); +} + +ceph::io_exerciser::InjectWriteErrorOp::InjectWriteErrorOp( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) + : InjectErrorOp<OpType::InjectWriteError>(shard, type, when, duration) {} + +std::unique_ptr<ceph::io_exerciser::InjectWriteErrorOp> +ceph::io_exerciser ::InjectWriteErrorOp::generate( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration) { + return std::make_unique<InjectWriteErrorOp>(shard, type, when, duration); +} + +template <ceph::io_exerciser::OpType opType> +ceph::io_exerciser::ClearErrorInjectOp<opType>::ClearErrorInjectOp( + int shard, const std::optional<uint64_t>& type) + : TestOp<opType>(), shard(shard), type(type) {} + +template <ceph::io_exerciser::OpType opType> +std::string ceph::io_exerciser::ClearErrorInjectOp<opType>::to_string( + uint64_t blocksize) const { + std::string_view inject_type = get_inject_type_string(); + return fmt::format("Clear {} injects on shard {} of type {}", inject_type, + shard, type.value_or(0)); +} + +ceph::io_exerciser::ClearReadErrorInjectOp::ClearReadErrorInjectOp( + int shard, const std::optional<uint64_t>& type) + : ClearErrorInjectOp<OpType::ClearReadErrorInject>(shard, type) {} + +std::unique_ptr<ceph::io_exerciser::ClearReadErrorInjectOp> +ceph::io_exerciser ::ClearReadErrorInjectOp::generate( + int shard, const std::optional<uint64_t>& type) { + return std::make_unique<ClearReadErrorInjectOp>(shard, type); +} + +ceph::io_exerciser::ClearWriteErrorInjectOp::ClearWriteErrorInjectOp( + int shard, const std::optional<uint64_t>& type) + : ClearErrorInjectOp<OpType::ClearWriteErrorInject>(shard, type) {} + +std::unique_ptr<ceph::io_exerciser::ClearWriteErrorInjectOp> +ceph::io_exerciser ::ClearWriteErrorInjectOp::generate( + int shard, const std::optional<uint64_t>& type) { + return std::make_unique<ClearWriteErrorInjectOp>(shard, type); }
\ No newline at end of file diff --git a/src/common/io_exerciser/IoOp.h b/src/common/io_exerciser/IoOp.h index 60c02a93d4e..1887eafcc1f 100644 --- a/src/common/io_exerciser/IoOp.h +++ b/src/common/io_exerciser/IoOp.h @@ -1,94 +1,248 @@ #pragma once -#include <string> +#include <array> #include <memory> -#include "include/ceph_assert.h" +#include <optional> +#include <string> + +#include "OpType.h" /* Overview * - * enum OpType - * Enumeration of different types of I/O operation - * * class IoOp * Stores details for an I/O operation. Generated by IoSequences * and applied by IoExerciser's */ namespace ceph { - namespace io_exerciser { - - enum class OpType { - Done, // End of I/O sequence - BARRIER, // Barrier - all prior I/Os must complete - CREATE, // Create object and pattern with data - REMOVE, // Remove object - READ, // Read - READ2, // 2 Reads in one op - READ3, // 3 Reads in one op - WRITE, // Write - WRITE2, // 2 Writes in one op - WRITE3 // 3 Writes in one op - }; - - class IoOp { - protected: - std::string value_to_string(uint64_t v) const; - - public: - OpType op; - uint64_t offset1; - uint64_t length1; - uint64_t offset2; - uint64_t length2; - uint64_t offset3; - uint64_t length3; - - IoOp( OpType op, - uint64_t offset1 = 0, uint64_t length1 = 0, - uint64_t offset2 = 0, uint64_t length2 = 0, - uint64_t offset3 = 0, uint64_t length3 = 0 ); - - static std::unique_ptr<IoOp> generate_done(); - - static std::unique_ptr<IoOp> generate_barrier(); - - static std::unique_ptr<IoOp> generate_create(uint64_t size); - - static std::unique_ptr<IoOp> generate_remove(); - - static std::unique_ptr<IoOp> generate_read(uint64_t offset, +namespace io_exerciser { + +class IoOp { + public: + IoOp(); + virtual ~IoOp() = default; + virtual std::string to_string(uint64_t block_size) const = 0; + virtual constexpr OpType getOpType() const = 0; +}; + +template <OpType opType> +class TestOp : public IoOp { + public: + TestOp(); + constexpr OpType getOpType() const override { return opType; } +}; + +class DoneOp : public TestOp<OpType::Done> { + public: + DoneOp(); + static std::unique_ptr<DoneOp> generate(); + std::string to_string(uint64_t block_size) const override; +}; + +class BarrierOp : public TestOp<OpType::Barrier> { + public: + BarrierOp(); + static std::unique_ptr<BarrierOp> generate(); + std::string to_string(uint64_t block_size) const override; +}; + +class CreateOp : public TestOp<OpType::Create> { + public: + CreateOp(uint64_t size); + static std::unique_ptr<CreateOp> generate(uint64_t size); + std::string to_string(uint64_t block_size) const override; + uint64_t size; +}; + +class RemoveOp : public TestOp<OpType::Remove> { + public: + RemoveOp(); + static std::unique_ptr<RemoveOp> generate(); + std::string to_string(uint64_t block_size) const override; +}; + +template <OpType opType, int numIOs> +class ReadWriteOp : public TestOp<opType> { + public: + std::array<uint64_t, numIOs> offset; + std::array<uint64_t, numIOs> length; + + protected: + ReadWriteOp(std::array<uint64_t, numIOs>&& offset, + std::array<uint64_t, numIOs>&& length); + std::string to_string(uint64_t block_size) const override; +}; + +class SingleReadOp : public ReadWriteOp<OpType::Read, 1> { + public: + SingleReadOp(uint64_t offset, uint64_t length); + static std::unique_ptr<SingleReadOp> generate(uint64_t offset, + uint64_t length); +}; + +class DoubleReadOp : public ReadWriteOp<OpType::Read2, 2> { + public: + DoubleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2); + static std::unique_ptr<DoubleReadOp> generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2); +}; + +class TripleReadOp : public ReadWriteOp<OpType::Read3, 3> { + public: + TripleReadOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2, uint64_t offset3, uint64_t length3); + static std::unique_ptr<TripleReadOp> generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3); +}; + +class SingleWriteOp : public ReadWriteOp<OpType::Write, 1> { + public: + SingleWriteOp(uint64_t offset, uint64_t length); + static std::unique_ptr<SingleWriteOp> generate(uint64_t offset, uint64_t length); +}; + +class DoubleWriteOp : public ReadWriteOp<OpType::Write2, 2> { + public: + DoubleWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2); + static std::unique_ptr<DoubleWriteOp> generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2); +}; + +class TripleWriteOp : public ReadWriteOp<OpType::Write3, 3> { + public: + TripleWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2, uint64_t offset3, uint64_t length3); + static std::unique_ptr<TripleWriteOp> generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3); +}; + +class SingleFailedWriteOp : public ReadWriteOp<OpType::FailedWrite, 1> { + public: + SingleFailedWriteOp(uint64_t offset, uint64_t length); + static std::unique_ptr<SingleFailedWriteOp> generate(uint64_t offset, + uint64_t length); +}; + +class DoubleFailedWriteOp : public ReadWriteOp<OpType::FailedWrite2, 2> { + public: + DoubleFailedWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2); + static std::unique_ptr<DoubleFailedWriteOp> generate(uint64_t offset1, + uint64_t length1, + uint64_t offset2, + uint64_t length2); +}; + +class TripleFailedWriteOp : public ReadWriteOp<OpType::FailedWrite3, 3> { + public: + TripleFailedWriteOp(uint64_t offset1, uint64_t length1, uint64_t offset2, + uint64_t length2, uint64_t offset3, uint64_t length3); + static std::unique_ptr<TripleFailedWriteOp> generate( + uint64_t offset1, uint64_t length1, uint64_t offset2, uint64_t length2, + uint64_t offset3, uint64_t length3); +}; + +template <ceph::io_exerciser::OpType opType> +class InjectErrorOp : public TestOp<opType> { + public: + InjectErrorOp(int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + std::string to_string(uint64_t block_size) const override; + + int shard; + std::optional<uint64_t> type; + std::optional<uint64_t> when; + std::optional<uint64_t> duration; + + protected: + virtual inline constexpr std::string_view get_inject_type_string() const = 0; +}; + +class InjectReadErrorOp : public InjectErrorOp<OpType::InjectReadError> { + public: + InjectReadErrorOp(int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + static std::unique_ptr<InjectReadErrorOp> generate( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + protected: + inline constexpr std::string_view get_inject_type_string() const override { + return "read"; + } +}; + +class InjectWriteErrorOp : public InjectErrorOp<OpType::InjectWriteError> { + public: + InjectWriteErrorOp(int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + static std::unique_ptr<InjectWriteErrorOp> generate( + int shard, const std::optional<uint64_t>& type, + const std::optional<uint64_t>& when, + const std::optional<uint64_t>& duration); + + protected: + inline constexpr std::string_view get_inject_type_string() const override { + return "write"; + } +}; + +template <ceph::io_exerciser::OpType opType> +class ClearErrorInjectOp : public TestOp<opType> { + public: + ClearErrorInjectOp(int shard, const std::optional<uint64_t>& type); + + std::string to_string(uint64_t block_size) const override; + + int shard; + std::optional<uint64_t> type; + + protected: + virtual inline constexpr std::string_view get_inject_type_string() const = 0; +}; + +class ClearReadErrorInjectOp + : public ClearErrorInjectOp<OpType::ClearReadErrorInject> { + public: + ClearReadErrorInjectOp(int shard, const std::optional<uint64_t>& type); + + static std::unique_ptr<ClearReadErrorInjectOp> generate( + int shard, const std::optional<uint64_t>& type); + + protected: + inline constexpr std::string_view get_inject_type_string() const override { + return "read"; + } +}; + +class ClearWriteErrorInjectOp + : public ClearErrorInjectOp<OpType::ClearWriteErrorInject> { + public: + ClearWriteErrorInjectOp(int shard, const std::optional<uint64_t>& type); + + static std::unique_ptr<ClearWriteErrorInjectOp> generate( + int shard, const std::optional<uint64_t>& type); - static std::unique_ptr<IoOp> generate_read2(uint64_t offset1, - uint64_t length1, - uint64_t offset2, - uint64_t length2); - - static std::unique_ptr<IoOp> generate_read3(uint64_t offset1, - uint64_t length1, - uint64_t offset2, - uint64_t length2, - uint64_t offset3, - uint64_t length3); - - static std::unique_ptr<IoOp> generate_write(uint64_t offset, - uint64_t length); - - static std::unique_ptr<IoOp> generate_write2(uint64_t offset1, - uint64_t length1, - uint64_t offset2, - uint64_t length2); - - static std::unique_ptr<IoOp> generate_write3(uint64_t offset1, - uint64_t length1, - uint64_t offset2, - uint64_t length2, - uint64_t offset3, - uint64_t length3); - - bool done(); - - std::string to_string(uint64_t block_size) const; - }; + protected: + inline constexpr std::string_view get_inject_type_string() const override { + return "write"; } -}
\ No newline at end of file +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/IoSequence.cc b/src/common/io_exerciser/IoSequence.cc index 4a7ca0593d1..83f1cc595a5 100644 --- a/src/common/io_exerciser/IoSequence.cc +++ b/src/common/io_exerciser/IoSequence.cc @@ -1,12 +1,12 @@ #include "IoSequence.h" +using IoOp = ceph::io_exerciser::IoOp; using Sequence = ceph::io_exerciser::Sequence; using IoSequence = ceph::io_exerciser::IoSequence; -std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& seq) -{ - switch (seq) - { +std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, + const Sequence& seq) { + switch (seq) { case Sequence::SEQUENCE_SEQ0: os << "SEQUENCE_SEQ0"; break; @@ -37,6 +37,9 @@ std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& s case Sequence::SEQUENCE_SEQ9: os << "SEQUENCE_SEQ9"; break; + case Sequence::SEQUENCE_SEQ10: + os << "SEQUENCE_SEQ10"; + break; case Sequence::SEQUENCE_END: os << "SEQUENCE_END"; break; @@ -44,19 +47,12 @@ std::ostream& ceph::io_exerciser::operator<<(std::ostream& os, const Sequence& s return os; } -IoSequence::IoSequence(std::pair<int,int> obj_size_range, - int seed) : - min_obj_size(obj_size_range.first), max_obj_size(obj_size_range.second), - create(true), barrier(false), done(false), remove(false), - obj_size(min_obj_size), step(-1), seed(seed) -{ - rng.seed(seed); +bool IoSequence::is_supported(Sequence sequence) const { + return sequence != Sequence::SEQUENCE_SEQ10; } -std::unique_ptr<IoSequence> IoSequence::generate_sequence(Sequence s, - std::pair<int,int> obj_size_range, - int seed) -{ +std::unique_ptr<IoSequence> IoSequence::generate_sequence( + Sequence s, std::pair<int, int> obj_size_range, int seed) { switch (s) { case Sequence::SEQUENCE_SEQ0: return std::make_unique<Seq0>(obj_size_range, seed); @@ -78,24 +74,39 @@ std::unique_ptr<IoSequence> IoSequence::generate_sequence(Sequence s, return std::make_unique<Seq8>(obj_size_range, seed); case Sequence::SEQUENCE_SEQ9: return std::make_unique<Seq9>(obj_size_range, seed); + case Sequence::SEQUENCE_SEQ10: + ceph_abort_msg( + "Sequence 10 only supported for erasure coded pools " + "through the EcIoSequence interface"); + return nullptr; default: break; } return nullptr; } -int IoSequence::get_step() const -{ - return step; +IoSequence::IoSequence(std::pair<int, int> obj_size_range, int seed) + : min_obj_size(obj_size_range.first), + max_obj_size(obj_size_range.second), + create(true), + barrier(false), + done(false), + remove(false), + obj_size(min_obj_size), + step(-1), + seed(seed) { + rng.seed(seed); } -int IoSequence::get_seed() const -{ - return seed; +std::string ceph::io_exerciser::IoSequence::get_name_with_seqseed() const { + return get_name() + " (seqseed " + std::to_string(get_seed()) + ")"; } -void IoSequence::set_min_object_size(uint64_t size) -{ +int IoSequence::get_step() const { return step; } + +int IoSequence::get_seed() const { return seed; } + +void IoSequence::set_min_object_size(uint64_t size) { min_obj_size = size; if (obj_size < size) { obj_size = size; @@ -105,23 +116,20 @@ void IoSequence::set_min_object_size(uint64_t size) } } -void IoSequence::set_max_object_size(uint64_t size) -{ +void IoSequence::set_max_object_size(uint64_t size) { max_obj_size = size; if (obj_size > size) { done = true; } } -void IoSequence::select_random_object_size() -{ +void IoSequence::select_random_object_size() { if (max_obj_size != min_obj_size) { obj_size = min_obj_size + rng(max_obj_size - min_obj_size); } } -std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::increment_object_size() -{ +std::unique_ptr<IoOp> IoSequence::increment_object_size() { obj_size++; if (obj_size > max_obj_size) { done = true; @@ -129,106 +137,118 @@ std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::increment_object_size() create = true; barrier = true; remove = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } -std::unique_ptr<ceph::io_exerciser::IoOp> IoSequence::next() -{ +Sequence IoSequence::getNextSupportedSequenceId() const { + Sequence sequence = get_id(); + ++sequence; + for (; sequence < Sequence::SEQUENCE_END; ++sequence) { + if (is_supported(sequence)) { + return sequence; + } + } + + return Sequence::SEQUENCE_END; +} + +std::unique_ptr<IoOp> IoSequence::next() { step++; if (remove) { remove = false; - return IoOp::generate_remove(); + return RemoveOp::generate(); } if (barrier) { barrier = false; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } if (done) { - return IoOp::generate_done(); + return DoneOp::generate(); } if (create) { create = false; barrier = true; - return IoOp::generate_create(obj_size); + return CreateOp::generate(obj_size); } return _next(); } - - -ceph::io_exerciser::Seq0::Seq0(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0) -{ +ceph::io_exerciser::Seq0::Seq0(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset(0) { select_random_object_size(); length = 1 + rng(obj_size - 1); } -std::string ceph::io_exerciser::Seq0::get_name() const -{ +Sequence ceph::io_exerciser::Seq0::get_id() const { + return Sequence::SEQUENCE_SEQ0; +} + +std::string ceph::io_exerciser::Seq0::get_name() const { return "Sequential reads of length " + std::to_string(length) + - " with queue depth 1 (seqseed " + std::to_string(get_seed()) + ")"; + " with queue depth 1"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq0::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq0::_next() { std::unique_ptr<IoOp> r; if (offset >= obj_size) { done = true; barrier = true; remove = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } if (offset + length > obj_size) { - r = IoOp::generate_read(offset, obj_size - offset); + r = SingleReadOp::generate(offset, obj_size - offset); } else { - r = IoOp::generate_read(offset, length); + r = SingleReadOp::generate(offset, length); } offset += length; return r; } - - -ceph::io_exerciser::Seq1::Seq1(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed) -{ +ceph::io_exerciser::Seq1::Seq1(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed) { select_random_object_size(); count = 3 * obj_size; } -std::string ceph::io_exerciser::Seq1::get_name() const -{ - return "Random offset, random length read/write I/O with queue depth 1 (seqseed " - + std::to_string(get_seed()) + ")"; +Sequence ceph::io_exerciser::Seq1::get_id() const { + return Sequence::SEQUENCE_SEQ1; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq1::_next() -{ +std::string ceph::io_exerciser::Seq1::get_name() const { + return "Random offset, random length read/write I/O with queue depth 1"; +} + +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq1::_next() { barrier = true; if (count-- == 0) { done = true; remove = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } uint64_t offset = rng(obj_size - 1); uint64_t length = 1 + rng(obj_size - 1 - offset); - return (rng(2) != 0) ? IoOp::generate_write(offset, length) : - IoOp::generate_read(offset, length); -} + if (rng(2) != 0) { + return SingleWriteOp::generate(offset, length); + } else { + return SingleReadOp::generate(offset, length); + } +} +ceph::io_exerciser::Seq2::Seq2(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset(0), length(0) {} -ceph::io_exerciser::Seq2::Seq2(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0), length(0) {} +Sequence ceph::io_exerciser::Seq2::get_id() const { + return Sequence::SEQUENCE_SEQ2; +} -std::string ceph::io_exerciser::Seq2::get_name() const -{ +std::string ceph::io_exerciser::Seq2::get_name() const { return "Permutations of offset and length read I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next() { length++; if (length > obj_size - offset) { length = 1; @@ -239,24 +259,23 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq2::_next() return increment_object_size(); } } - return IoOp::generate_read(offset, length); + return SingleReadOp::generate(offset, length); } - - -ceph::io_exerciser::Seq3::Seq3(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset1(0), offset2(0) -{ +ceph::io_exerciser::Seq3::Seq3(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset1(0), offset2(0) { set_min_object_size(2); } -std::string ceph::io_exerciser::Seq3::get_name() const -{ +Sequence ceph::io_exerciser::Seq3::get_id() const { + return Sequence::SEQUENCE_SEQ3; +} + +std::string ceph::io_exerciser::Seq3::get_name() const { return "Permutations of offset 2-region 1-block read I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next() { offset2++; if (offset2 >= obj_size - offset1) { offset2 = 1; @@ -267,24 +286,23 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq3::_next() return increment_object_size(); } } - return IoOp::generate_read2(offset1, 1, offset1 + offset2, 1); + return DoubleReadOp::generate(offset1, 1, offset1 + offset2, 1); } - - -ceph::io_exerciser::Seq4::Seq4(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset1(0), offset2(1) -{ +ceph::io_exerciser::Seq4::Seq4(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset1(0), offset2(1) { set_min_object_size(3); } -std::string ceph::io_exerciser::Seq4::get_name() const -{ +Sequence ceph::io_exerciser::Seq4::get_id() const { + return Sequence::SEQUENCE_SEQ4; +} + +std::string ceph::io_exerciser::Seq4::get_name() const { return "Permutations of offset 3-region 1-block read I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next() { offset2++; if (offset2 >= obj_size - offset1) { offset2 = 2; @@ -295,33 +313,35 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq4::_next() return increment_object_size(); } } - return IoOp::generate_read3(offset1, 1, - offset1 + offset2, 1, - (offset1 * 2 + offset2)/2, 1); + return TripleReadOp::generate(offset1, 1, (offset1 + offset2), 1, + (offset1 * 2 + offset2) / 2, 1); } +ceph::io_exerciser::Seq5::Seq5(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), + offset(0), + length(1), + doneread(false), + donebarrier(false) {} +Sequence ceph::io_exerciser::Seq5::get_id() const { + return Sequence::SEQUENCE_SEQ5; +} -ceph::io_exerciser::Seq5::Seq5(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0), length(1), - doneread(false), donebarrier(false) {} - -std::string ceph::io_exerciser::Seq5::get_name() const -{ +std::string ceph::io_exerciser::Seq5::get_name() const { return "Permutation of length sequential writes"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next() { if (offset >= obj_size) { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, obj_size); } doneread = false; donebarrier = false; @@ -333,33 +353,36 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq5::_next() } } uint64_t io_len = (offset + length > obj_size) ? (obj_size - offset) : length; - std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len); + std::unique_ptr<IoOp> r = SingleWriteOp::generate(offset, io_len); offset += io_len; return r; } +ceph::io_exerciser::Seq6::Seq6(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), + offset(0), + length(1), + doneread(false), + donebarrier(false) {} +Sequence ceph::io_exerciser::Seq6::get_id() const { + return Sequence::SEQUENCE_SEQ6; +} -ceph::io_exerciser::Seq6::Seq6(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0), length(1), - doneread(false), donebarrier(false) {} - -std::string ceph::io_exerciser::Seq6::get_name() const -{ +std::string ceph::io_exerciser::Seq6::get_name() const { return "Permutation of length sequential writes, different alignment"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next() { if (offset >= obj_size) { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, obj_size); } doneread = false; donebarrier = false; @@ -374,74 +397,72 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq6::_next() if (io_len == 0) { io_len = length; } - std::unique_ptr<IoOp> r = IoOp::generate_write(offset, io_len); + std::unique_ptr<IoOp> r = SingleWriteOp::generate(offset, io_len); offset += io_len; return r; } - - -ceph::io_exerciser::Seq7::Seq7(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed) -{ +ceph::io_exerciser::Seq7::Seq7(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed) { set_min_object_size(2); offset = obj_size; } -std::string ceph::io_exerciser::Seq7::get_name() const -{ +Sequence ceph::io_exerciser::Seq7::get_id() const { + return Sequence::SEQUENCE_SEQ7; +} + +std::string ceph::io_exerciser::Seq7::get_name() const { return "Permutations of offset 2-region 1-block writes"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq7::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq7::_next() { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, obj_size); } if (offset == 0) { doneread = false; donebarrier = false; - offset = obj_size+1; + offset = obj_size + 1; return increment_object_size(); } offset--; - if (offset == obj_size/2) { + if (offset == obj_size / 2) { return _next(); } doneread = false; donebarrier = false; - return IoOp::generate_write2(offset, 1, obj_size/2, 1); + return DoubleReadOp::generate(offset, 1, obj_size / 2, 1); } - - -ceph::io_exerciser::Seq8::Seq8(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset1(0), offset2(1) -{ +ceph::io_exerciser::Seq8::Seq8(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset1(0), offset2(1) { set_min_object_size(3); } -std::string ceph::io_exerciser::Seq8::get_name() const -{ +Sequence ceph::io_exerciser::Seq8::get_id() const { + return Sequence::SEQUENCE_SEQ8; +} + +std::string ceph::io_exerciser::Seq8::get_name() const { return "Permutations of offset 3-region 1-block write I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next() { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, obj_size); } offset2++; if (offset2 >= obj_size - offset1) { @@ -455,34 +476,30 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq8::_next() } doneread = false; donebarrier = false; - return IoOp::generate_write3(offset1, 1, - offset1 + offset2, 1, - (offset1 * 2 + offset2)/2, 1); + return TripleWriteOp::generate(offset1, 1, offset1 + offset2, 1, + (offset1 * 2 + offset2) / 2, 1); } +ceph::io_exerciser::Seq9::Seq9(std::pair<int, int> obj_size_range, int seed) + : IoSequence(obj_size_range, seed), offset(0), length(0) {} - -ceph::io_exerciser::Seq9::Seq9(std::pair<int,int> obj_size_range, int seed) : - IoSequence(obj_size_range, seed), offset(0), length(0) -{ - +Sequence ceph::io_exerciser::Seq9::get_id() const { + return Sequence::SEQUENCE_SEQ9; } -std::string ceph::io_exerciser::Seq9::get_name() const -{ +std::string ceph::io_exerciser::Seq9::get_name() const { return "Permutations of offset and length write I/O"; } -std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next() -{ +std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next() { if (!doneread) { if (!donebarrier) { donebarrier = true; - return IoOp::generate_barrier(); + return BarrierOp::generate(); } doneread = true; barrier = true; - return IoOp::generate_read(0, obj_size); + return SingleReadOp::generate(0, obj_size); } length++; if (length > obj_size - offset) { @@ -496,5 +513,5 @@ std::unique_ptr<ceph::io_exerciser::IoOp> ceph::io_exerciser::Seq9::_next() } doneread = false; donebarrier = false; - return IoOp::generate_write(offset, length); + return SingleWriteOp::generate(offset, length); }
\ No newline at end of file diff --git a/src/common/io_exerciser/IoSequence.h b/src/common/io_exerciser/IoSequence.h index 114ff76303f..b6c254cf096 100644 --- a/src/common/io_exerciser/IoSequence.h +++ b/src/common/io_exerciser/IoSequence.h @@ -3,7 +3,6 @@ #pragma once #include "IoOp.h" - #include "include/random.h" /* Overview @@ -29,195 +28,209 @@ */ namespace ceph { - namespace io_exerciser { - - enum class Sequence { - SEQUENCE_SEQ0, - SEQUENCE_SEQ1, - SEQUENCE_SEQ2, - SEQUENCE_SEQ3, - SEQUENCE_SEQ4, - SEQUENCE_SEQ5, - SEQUENCE_SEQ6, - SEQUENCE_SEQ7, - SEQUENCE_SEQ8, - SEQUENCE_SEQ9, - // - SEQUENCE_END, - SEQUENCE_BEGIN = SEQUENCE_SEQ0 - }; - - inline Sequence operator++( Sequence& s ) - { - return s = (Sequence)(((int)(s) + 1)); - } - - std::ostream& operator<<(std::ostream& os, const Sequence& seq); - - /* I/O Sequences */ - - class IoSequence { - public: - virtual ~IoSequence() = default; - - virtual std::string get_name() const = 0; - int get_step() const; - int get_seed() const; - - std::unique_ptr<IoOp> next(); - - static std::unique_ptr<IoSequence> - generate_sequence(Sequence s, std::pair<int,int> obj_size_range, int seed ); - - protected: - uint64_t min_obj_size; - uint64_t max_obj_size; - bool create; - bool barrier; - bool done; - bool remove; - uint64_t obj_size; - int step; - int seed; - ceph::util::random_number_generator<int> rng = - ceph::util::random_number_generator<int>(); - - IoSequence(std::pair<int,int> obj_size_range, int seed); - - virtual std::unique_ptr<IoOp> _next() = 0; - - void set_min_object_size(uint64_t size); - void set_max_object_size(uint64_t size); - void select_random_object_size(); - std::unique_ptr<IoOp> increment_object_size(); - - }; - - class Seq0: public IoSequence { - public: - Seq0(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - uint64_t length; - }; - - class Seq1: public IoSequence { - public: - Seq1(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next(); - - private: - int count; - }; - - class Seq2: public IoSequence { - public: - Seq2(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - uint64_t length; - }; - - class Seq3: public IoSequence { - public: - Seq3(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - private: - uint64_t offset1; - uint64_t offset2; - }; - - class Seq4: public IoSequence { - public: - Seq4(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset1; - uint64_t offset2; - }; - - class Seq5: public IoSequence { - public: - Seq5(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - uint64_t length; - bool doneread; - bool donebarrier; - }; - - class Seq6: public IoSequence { - public: - Seq6(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - uint64_t length; - bool doneread; - bool donebarrier; - }; - - class Seq7: public IoSequence { - public: - Seq7(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - - private: - uint64_t offset; - bool doneread = true; - bool donebarrier = false; - }; - - class Seq8: public IoSequence { - public: - Seq8(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - std::unique_ptr<IoOp> _next() override; - private: - uint64_t offset1; - uint64_t offset2; - bool doneread = true; - bool donebarrier = false; - }; - - class Seq9: public IoSequence { - private: - uint64_t offset; - uint64_t length; - bool doneread = true; - bool donebarrier = false; - - public: - Seq9(std::pair<int,int> obj_size_range, int seed); - - std::string get_name() const override; - - std::unique_ptr<IoOp> _next() override; - }; - } -}
\ No newline at end of file +namespace io_exerciser { + +enum class Sequence { + SEQUENCE_SEQ0, + SEQUENCE_SEQ1, + SEQUENCE_SEQ2, + SEQUENCE_SEQ3, + SEQUENCE_SEQ4, + SEQUENCE_SEQ5, + SEQUENCE_SEQ6, + SEQUENCE_SEQ7, + SEQUENCE_SEQ8, + SEQUENCE_SEQ9, + SEQUENCE_SEQ10, + + SEQUENCE_END, + SEQUENCE_BEGIN = SEQUENCE_SEQ0 +}; + +inline Sequence operator++(Sequence& s) { + return s = (Sequence)(((int)(s) + 1)); +} + +std::ostream& operator<<(std::ostream& os, const Sequence& seq); + +/* I/O Sequences */ + +class IoSequence { + public: + virtual ~IoSequence() = default; + + virtual Sequence get_id() const = 0; + virtual std::string get_name_with_seqseed() const; + virtual std::string get_name() const = 0; + int get_step() const; + int get_seed() const; + + virtual Sequence getNextSupportedSequenceId() const; + virtual std::unique_ptr<IoOp> next(); + + virtual bool is_supported(Sequence sequence) const; + static std::unique_ptr<IoSequence> generate_sequence( + Sequence s, std::pair<int, int> obj_size_range, int seed); + + protected: + uint64_t min_obj_size; + uint64_t max_obj_size; + bool create; + bool barrier; + bool done; + bool remove; + uint64_t obj_size; + int step; + int seed; + ceph::util::random_number_generator<int> rng = + ceph::util::random_number_generator<int>(); + + IoSequence(std::pair<int, int> obj_size_range, int seed); + + virtual std::unique_ptr<IoOp> _next() = 0; + + void set_min_object_size(uint64_t size); + void set_max_object_size(uint64_t size); + void select_random_object_size(); + std::unique_ptr<IoOp> increment_object_size(); +}; + +class Seq0 : public IoSequence { + public: + Seq0(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; +}; + +class Seq1 : public IoSequence { + public: + Seq1(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + int count; +}; + +class Seq2 : public IoSequence { + public: + Seq2(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; +}; + +class Seq3 : public IoSequence { + public: + Seq3(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset1; + uint64_t offset2; +}; + +class Seq4 : public IoSequence { + public: + Seq4(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset1; + uint64_t offset2; +}; + +class Seq5 : public IoSequence { + public: + Seq5(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + bool doneread; + bool donebarrier; +}; + +class Seq6 : public IoSequence { + public: + Seq6(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + uint64_t length; + bool doneread; + bool donebarrier; +}; + +class Seq7 : public IoSequence { + public: + Seq7(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset; + bool doneread = true; + bool donebarrier = false; +}; + +class Seq8 : public IoSequence { + public: + Seq8(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; + + private: + uint64_t offset1; + uint64_t offset2; + bool doneread = true; + bool donebarrier = false; +}; + +class Seq9 : public IoSequence { + private: + uint64_t offset; + uint64_t length; + bool doneread = true; + bool donebarrier = false; + + public: + Seq9(std::pair<int, int> obj_size_range, int seed); + + Sequence get_id() const override; + std::string get_name() const override; + std::unique_ptr<IoOp> _next() override; +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/Model.cc b/src/common/io_exerciser/Model.cc index 50812ecbb15..6548e1eda7a 100644 --- a/src/common/io_exerciser/Model.cc +++ b/src/common/io_exerciser/Model.cc @@ -4,25 +4,11 @@ using Model = ceph::io_exerciser::Model; -Model::Model(const std::string& oid, uint64_t block_size) : -num_io(0), -oid(oid), -block_size(block_size) -{ +Model::Model(const std::string& oid, uint64_t block_size) + : num_io(0), oid(oid), block_size(block_size) {} -} +const uint64_t Model::get_block_size() const { return block_size; } -const uint64_t Model::get_block_size() const -{ - return block_size; -} +const std::string Model::get_oid() const { return oid; } -const std::string Model::get_oid() const -{ - return oid; -} - -int Model::get_num_io() const -{ - return num_io; -}
\ No newline at end of file +int Model::get_num_io() const { return num_io; }
\ No newline at end of file diff --git a/src/common/io_exerciser/Model.h b/src/common/io_exerciser/Model.h index 58d107409a6..9e421e79a78 100644 --- a/src/common/io_exerciser/Model.h +++ b/src/common/io_exerciser/Model.h @@ -1,15 +1,13 @@ #pragma once -#include "IoOp.h" - #include <boost/asio/io_context.hpp> -#include "librados/librados_asio.h" - -#include "include/interval_set.h" -#include "global/global_init.h" -#include "global/global_context.h" +#include "IoOp.h" #include "common/Thread.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "include/interval_set.h" +#include "librados/librados_asio.h" /* Overview * @@ -21,29 +19,27 @@ */ namespace ceph { - namespace io_exerciser { - - class Model - { - protected: - int num_io{0}; - std::string oid; - uint64_t block_size; - - public: - Model(const std::string& oid, uint64_t block_size); - virtual ~Model() = default; - - virtual bool readyForIoOp(IoOp& op) = 0; - virtual void applyIoOp(IoOp& op) = 0; - - const std::string get_oid() const; - const uint64_t get_block_size() const; - int get_num_io() const; - }; - - /* Simple RADOS I/O generator */ - - - } -}
\ No newline at end of file +namespace io_exerciser { + +class Model { + protected: + int num_io{0}; + std::string oid; + uint64_t block_size; + + public: + Model(const std::string& oid, uint64_t block_size); + virtual ~Model() = default; + + virtual bool readyForIoOp(IoOp& op) = 0; + virtual void applyIoOp(IoOp& op) = 0; + + const std::string get_oid() const; + const uint64_t get_block_size() const; + int get_num_io() const; +}; + +/* Simple RADOS I/O generator */ + +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/ObjectModel.cc b/src/common/io_exerciser/ObjectModel.cc index 589f6434282..454d7254cf2 100644 --- a/src/common/io_exerciser/ObjectModel.cc +++ b/src/common/io_exerciser/ObjectModel.cc @@ -6,25 +6,20 @@ using ObjectModel = ceph::io_exerciser::ObjectModel; -ObjectModel::ObjectModel(const std::string& oid, uint64_t block_size, int seed) : - Model(oid, block_size), created(false) -{ +ObjectModel::ObjectModel(const std::string& oid, uint64_t block_size, int seed) + : Model(oid, block_size), created(false) { rng.seed(seed); } -int ObjectModel::get_seed(uint64_t offset) const -{ +int ObjectModel::get_seed(uint64_t offset) const { ceph_assert(offset < contents.size()); return contents[offset]; } -std::vector<int> ObjectModel::get_seed_offsets(int seed) const -{ +std::vector<int> ObjectModel::get_seed_offsets(int seed) const { std::vector<int> offsets; - for (size_t i = 0; i < contents.size(); i++) - { - if (contents[i] == seed) - { + for (size_t i = 0; i < contents.size(); i++) { + if (contents[i] == seed) { offsets.push_back(i); } } @@ -32,8 +27,7 @@ std::vector<int> ObjectModel::get_seed_offsets(int seed) const return offsets; } -std::string ObjectModel::to_string(int mask) const -{ +std::string ObjectModel::to_string(int mask) const { if (!created) { return "Object does not exist"; } @@ -48,107 +42,127 @@ std::string ObjectModel::to_string(int mask) const return result; } -bool ObjectModel::readyForIoOp(IoOp& op) -{ - return true; -} - -void ObjectModel::applyIoOp(IoOp& op) -{ - auto generate_random = [&rng = rng]() { - return rng(); - }; - - switch (op.op) { - case OpType::BARRIER: - reads.clear(); - writes.clear(); - break; - - case OpType::CREATE: - ceph_assert(!created); - ceph_assert(reads.empty()); - ceph_assert(writes.empty()); - created = true; - contents.resize(op.length1); - std::generate(std::execution::seq, contents.begin(), contents.end(), - generate_random); - break; - - case OpType::REMOVE: - ceph_assert(created); - ceph_assert(reads.empty()); - ceph_assert(writes.empty()); - created = false; - contents.resize(0); - break; - - case OpType::READ3: - ceph_assert(created); - ceph_assert(op.offset3 + op.length3 <= contents.size()); - // Not allowed: read overlapping with parallel write - ceph_assert(!writes.intersects(op.offset3, op.length3)); - reads.union_insert(op.offset3, op.length3); - [[fallthrough]]; - - case OpType::READ2: - ceph_assert(created); - ceph_assert(op.offset2 + op.length2 <= contents.size()); - // Not allowed: read overlapping with parallel write - ceph_assert(!writes.intersects(op.offset2, op.length2)); - reads.union_insert(op.offset2, op.length2); - [[fallthrough]]; - - case OpType::READ: - ceph_assert(created); - ceph_assert(op.offset1 + op.length1 <= contents.size()); - // Not allowed: read overlapping with parallel write - ceph_assert(!writes.intersects(op.offset1, op.length1)); - reads.union_insert(op.offset1, op.length1); - num_io++; - break; - - case OpType::WRITE3: - ceph_assert(created); - // Not allowed: write overlapping with parallel read or write - ceph_assert(!reads.intersects(op.offset3, op.length3)); - ceph_assert(!writes.intersects(op.offset3, op.length3)); - writes.union_insert(op.offset3, op.length3); - ceph_assert(op.offset3 + op.length3 <= contents.size()); - std::generate(std::execution::seq, - std::next(contents.begin(), op.offset3), - std::next(contents.begin(), op.offset3 + op.length3), - generate_random); - [[fallthrough]]; - - case OpType::WRITE2: - ceph_assert(created); - // Not allowed: write overlapping with parallel read or write - ceph_assert(!reads.intersects(op.offset2, op.length2)); - ceph_assert(!writes.intersects(op.offset2, op.length2)); - writes.union_insert(op.offset2, op.length2); - ceph_assert(op.offset2 + op.length2 <= contents.size()); - std::generate(std::execution::seq, - std::next(contents.begin(), op.offset2), - std::next(contents.begin(), op.offset2 + op.length2), - generate_random); - [[fallthrough]]; - - case OpType::WRITE: - ceph_assert(created); - // Not allowed: write overlapping with parallel read or write - ceph_assert(!reads.intersects(op.offset1, op.length1)); - ceph_assert(!writes.intersects(op.offset1, op.length1)); - writes.union_insert(op.offset1, op.length1); - ceph_assert(op.offset1 + op.length1 <= contents.size()); - std::generate(std::execution::seq, - std::next(contents.begin(), op.offset1), - std::next(contents.begin(), op.offset1 + op.length1), - generate_random); - num_io++; - break; - default: - break; +bool ObjectModel::readyForIoOp(IoOp& op) { return true; } + +void ObjectModel::applyIoOp(IoOp& op) { + auto generate_random = [&rng = rng]() { return rng(); }; + + auto verify_and_record_read_op = + [&contents = contents, &created = created, &num_io = num_io, + &reads = reads, + &writes = writes]<OpType opType, int N>(ReadWriteOp<opType, N>& readOp) { + ceph_assert(created); + for (int i = 0; i < N; i++) { + ceph_assert(readOp.offset[i] + readOp.length[i] <= contents.size()); + // Not allowed: read overlapping with parallel write + ceph_assert(!writes.intersects(readOp.offset[i], readOp.length[i])); + reads.union_insert(readOp.offset[i], readOp.length[i]); + } + num_io++; + }; + + auto verify_write_and_record_and_generate_seed = + [&generate_random, &contents = contents, &created = created, + &num_io = num_io, &reads = reads, + &writes = writes]<OpType opType, int N>(ReadWriteOp<opType, N> writeOp) { + ceph_assert(created); + for (int i = 0; i < N; i++) { + // Not allowed: write overlapping with parallel read or write + ceph_assert(!reads.intersects(writeOp.offset[i], writeOp.length[i])); + ceph_assert(!writes.intersects(writeOp.offset[i], writeOp.length[i])); + writes.union_insert(writeOp.offset[i], writeOp.length[i]); + ceph_assert(writeOp.offset[i] + writeOp.length[i] <= contents.size()); + std::generate(std::execution::seq, + std::next(contents.begin(), writeOp.offset[i]), + std::next(contents.begin(), + writeOp.offset[i] + writeOp.length[i]), + generate_random); + } + num_io++; + }; + + auto verify_failed_write_and_record = + [&contents = contents, &created = created, &num_io = num_io, + &reads = reads, + &writes = writes]<OpType opType, int N>(ReadWriteOp<opType, N> writeOp) { + // Ensure write should still be valid, even though we are expecting OSD + // failure + ceph_assert(created); + for (int i = 0; i < N; i++) { + // Not allowed: write overlapping with parallel read or write + ceph_assert(!reads.intersects(writeOp.offset[i], writeOp.length[i])); + ceph_assert(!writes.intersects(writeOp.offset[i], writeOp.length[i])); + writes.union_insert(writeOp.offset[i], writeOp.length[i]); + ceph_assert(writeOp.offset[i] + writeOp.length[i] <= contents.size()); + } + num_io++; + }; + + switch (op.getOpType()) { + case OpType::Barrier: + reads.clear(); + writes.clear(); + break; + + case OpType::Create: + ceph_assert(!created); + ceph_assert(reads.empty()); + ceph_assert(writes.empty()); + created = true; + contents.resize(static_cast<CreateOp&>(op).size); + std::generate(std::execution::seq, contents.begin(), contents.end(), + generate_random); + break; + + case OpType::Remove: + ceph_assert(created); + ceph_assert(reads.empty()); + ceph_assert(writes.empty()); + created = false; + contents.resize(0); + break; + + case OpType::Read: { + SingleReadOp& readOp = static_cast<SingleReadOp&>(op); + verify_and_record_read_op(readOp); + } break; + case OpType::Read2: { + DoubleReadOp& readOp = static_cast<DoubleReadOp&>(op); + verify_and_record_read_op(readOp); + } break; + case OpType::Read3: { + TripleReadOp& readOp = static_cast<TripleReadOp&>(op); + verify_and_record_read_op(readOp); + } break; + + case OpType::Write: { + ceph_assert(created); + SingleWriteOp& writeOp = static_cast<SingleWriteOp&>(op); + verify_write_and_record_and_generate_seed(writeOp); + } break; + case OpType::Write2: { + DoubleWriteOp& writeOp = static_cast<DoubleWriteOp&>(op); + verify_write_and_record_and_generate_seed(writeOp); + } break; + case OpType::Write3: { + TripleWriteOp& writeOp = static_cast<TripleWriteOp&>(op); + verify_write_and_record_and_generate_seed(writeOp); + } break; + case OpType::FailedWrite: { + ceph_assert(created); + SingleWriteOp& writeOp = static_cast<SingleWriteOp&>(op); + verify_failed_write_and_record(writeOp); + } break; + case OpType::FailedWrite2: { + DoubleWriteOp& writeOp = static_cast<DoubleWriteOp&>(op); + verify_failed_write_and_record(writeOp); + } break; + case OpType::FailedWrite3: { + TripleWriteOp& writeOp = static_cast<TripleWriteOp&>(op); + verify_failed_write_and_record(writeOp); + } break; + default: + break; } } diff --git a/src/common/io_exerciser/ObjectModel.h b/src/common/io_exerciser/ObjectModel.h index 93c70f41429..cad1307b84e 100644 --- a/src/common/io_exerciser/ObjectModel.h +++ b/src/common/io_exerciser/ObjectModel.h @@ -14,40 +14,41 @@ */ namespace ceph { - namespace io_exerciser { - /* Model of an object to track its data contents */ - - class ObjectModel : public Model { - private: - bool created; - std::vector<int> contents; - ceph::util::random_number_generator<int> rng = - ceph::util::random_number_generator<int>(); - - // Track read and write I/Os that can be submitted in - // parallel to detect violations: - // - // * Read may not overlap with a parallel write - // * Write may not overlap with a parallel read or write - // * Create / remove may not be in parallel with read or write - // - // Fix broken test cases by adding barrier ops to restrict - // I/O exercisers from issuing conflicting ops in parallel - interval_set<uint64_t> reads; - interval_set<uint64_t> writes; - public: - ObjectModel(const std::string& oid, uint64_t block_size, int seed); - - int get_seed(uint64_t offset) const; - std::vector<int> get_seed_offsets(int seed) const; - - std::string to_string(int mask = -1) const; - - bool readyForIoOp(IoOp& op); - void applyIoOp(IoOp& op); - - void encode(ceph::buffer::list& bl) const; - void decode(ceph::buffer::list::const_iterator& bl); - }; - } -}
\ No newline at end of file +namespace io_exerciser { +/* Model of an object to track its data contents */ + +class ObjectModel : public Model { + private: + bool created; + std::vector<int> contents; + ceph::util::random_number_generator<int> rng = + ceph::util::random_number_generator<int>(); + + // Track read and write I/Os that can be submitted in + // parallel to detect violations: + // + // * Read may not overlap with a parallel write + // * Write may not overlap with a parallel read or write + // * Create / remove may not be in parallel with read or write + // + // Fix broken test cases by adding barrier ops to restrict + // I/O exercisers from issuing conflicting ops in parallel + interval_set<uint64_t> reads; + interval_set<uint64_t> writes; + + public: + ObjectModel(const std::string& oid, uint64_t block_size, int seed); + + int get_seed(uint64_t offset) const; + std::vector<int> get_seed_offsets(int seed) const; + + std::string to_string(int mask = -1) const; + + bool readyForIoOp(IoOp& op); + void applyIoOp(IoOp& op); + + void encode(ceph::buffer::list& bl) const; + void decode(ceph::buffer::list::const_iterator& bl); +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/io_exerciser/OpType.h b/src/common/io_exerciser/OpType.h new file mode 100644 index 00000000000..7cddb805e45 --- /dev/null +++ b/src/common/io_exerciser/OpType.h @@ -0,0 +1,91 @@ +#pragma once + +#include <fmt/format.h> +#include <include/ceph_assert.h> + +/* Overview + * + * enum OpType + * Enumeration of different types of I/O operation + * + */ + +namespace ceph { +namespace io_exerciser { +enum class OpType { + Done, // End of I/O sequence + Barrier, // Barrier - all prior I/Os must complete + Create, // Create object and pattern with data + Remove, // Remove object + Read, // Read + Read2, // Two reads in a single op + Read3, // Three reads in a single op + Write, // Write + Write2, // Two writes in a single op + Write3, // Three writes in a single op + FailedWrite, // A write which should fail + FailedWrite2, // Two writes in one op which should fail + FailedWrite3, // Three writes in one op which should fail + InjectReadError, // Op to tell OSD to inject read errors + InjectWriteError, // Op to tell OSD to inject write errors + ClearReadErrorInject, // Op to tell OSD to clear read error injects + ClearWriteErrorInject // Op to tell OSD to clear write error injects +}; + +enum class InjectOpType { + None, + ReadEIO, + ReadMissingShard, + WriteFailAndRollback, + WriteOSDAbort +}; +} // namespace io_exerciser +} // namespace ceph + +template <> +struct fmt::formatter<ceph::io_exerciser::OpType> { + constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } + + auto format(ceph::io_exerciser::OpType opType, + fmt::format_context& ctx) const -> fmt::format_context::iterator { + switch (opType) { + case ceph::io_exerciser::OpType::Done: + return fmt::format_to(ctx.out(), "Done"); + case ceph::io_exerciser::OpType::Barrier: + return fmt::format_to(ctx.out(), "Barrier"); + case ceph::io_exerciser::OpType::Create: + return fmt::format_to(ctx.out(), "Create"); + case ceph::io_exerciser::OpType::Remove: + return fmt::format_to(ctx.out(), "Remove"); + case ceph::io_exerciser::OpType::Read: + return fmt::format_to(ctx.out(), "Read"); + case ceph::io_exerciser::OpType::Read2: + return fmt::format_to(ctx.out(), "Read2"); + case ceph::io_exerciser::OpType::Read3: + return fmt::format_to(ctx.out(), "Read3"); + case ceph::io_exerciser::OpType::Write: + return fmt::format_to(ctx.out(), "Write"); + case ceph::io_exerciser::OpType::Write2: + return fmt::format_to(ctx.out(), "Write2"); + case ceph::io_exerciser::OpType::Write3: + return fmt::format_to(ctx.out(), "Write3"); + case ceph::io_exerciser::OpType::FailedWrite: + return fmt::format_to(ctx.out(), "FailedWrite"); + case ceph::io_exerciser::OpType::FailedWrite2: + return fmt::format_to(ctx.out(), "FailedWrite2"); + case ceph::io_exerciser::OpType::FailedWrite3: + return fmt::format_to(ctx.out(), "FailedWrite3"); + case ceph::io_exerciser::OpType::InjectReadError: + return fmt::format_to(ctx.out(), "InjectReadError"); + case ceph::io_exerciser::OpType::InjectWriteError: + return fmt::format_to(ctx.out(), "InjectWriteError"); + case ceph::io_exerciser::OpType::ClearReadErrorInject: + return fmt::format_to(ctx.out(), "ClearReadErrorInject"); + case ceph::io_exerciser::OpType::ClearWriteErrorInject: + return fmt::format_to(ctx.out(), "ClearWriteErrorInject"); + default: + ceph_abort_msg("Unknown OpType"); + return fmt::format_to(ctx.out(), "Unknown OpType"); + } + } +};
\ No newline at end of file diff --git a/src/common/io_exerciser/RadosIo.cc b/src/common/io_exerciser/RadosIo.cc index 44b82260263..4451900b7bb 100644 --- a/src/common/io_exerciser/RadosIo.cc +++ b/src/common/io_exerciser/RadosIo.cc @@ -1,300 +1,429 @@ #include "RadosIo.h" +#include <fmt/format.h> +#include <json_spirit/json_spirit.h> + +#include <ranges> + #include "DataGenerator.h" +#include "common/ceph_json.h" +#include "common/json/OSDStructures.h" using RadosIo = ceph::io_exerciser::RadosIo; -RadosIo::RadosIo(librados::Rados& rados, - boost::asio::io_context& asio, - const std::string& pool, - const std::string& oid, - uint64_t block_size, - int seed, - int threads, - ceph::mutex& lock, - ceph::condition_variable& cond) : - Model(oid, block_size), - rados(rados), - asio(asio), - om(std::make_unique<ObjectModel>(oid, block_size, seed)), - db(data_generation::DataGenerator::create_generator( - data_generation::GenerationType::HeaderedSeededRandom, *om)), - pool(pool), - threads(threads), - lock(lock), - cond(cond), - outstanding_io(0) -{ +RadosIo::RadosIo(librados::Rados& rados, boost::asio::io_context& asio, + const std::string& pool, const std::string& oid, + const std::optional<std::vector<int>>& cached_shard_order, + uint64_t block_size, int seed, int threads, ceph::mutex& lock, + ceph::condition_variable& cond) + : Model(oid, block_size), + rados(rados), + asio(asio), + om(std::make_unique<ObjectModel>(oid, block_size, seed)), + db(data_generation::DataGenerator::create_generator( + data_generation::GenerationType::HeaderedSeededRandom, *om)), + pool(pool), + cached_shard_order(cached_shard_order), + threads(threads), + lock(lock), + cond(cond), + outstanding_io(0) { int rc; rc = rados.ioctx_create(pool.c_str(), io); ceph_assert(rc == 0); allow_ec_overwrites(true); } -RadosIo::~RadosIo() -{ -} +RadosIo::~RadosIo() {} -void RadosIo::start_io() -{ +void RadosIo::start_io() { std::lock_guard l(lock); outstanding_io++; } -void RadosIo::finish_io() -{ +void RadosIo::finish_io() { std::lock_guard l(lock); ceph_assert(outstanding_io > 0); outstanding_io--; cond.notify_all(); } -void RadosIo::wait_for_io(int count) -{ +void RadosIo::wait_for_io(int count) { std::unique_lock l(lock); while (outstanding_io > count) { cond.wait(l); } } -void RadosIo::allow_ec_overwrites(bool allow) -{ +void RadosIo::allow_ec_overwrites(bool allow) { int rc; bufferlist inbl, outbl; - std::string cmdstr = - "{\"prefix\": \"osd pool set\", \"pool\": \"" + pool + "\", \ + std::string cmdstr = "{\"prefix\": \"osd pool set\", \"pool\": \"" + pool + + "\", \ \"var\": \"allow_ec_overwrites\", \"val\": \"" + - (allow ? "true" : "false") + "\"}"; + (allow ? "true" : "false") + "\"}"; rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr); ceph_assert(rc == 0); } -RadosIo::AsyncOpInfo::AsyncOpInfo(uint64_t offset1, uint64_t length1, - uint64_t offset2, uint64_t length2, - uint64_t offset3, uint64_t length3 ) : - offset1(offset1), length1(length1), - offset2(offset2), length2(length2), - offset3(offset3), length3(length3) -{ - -} +template <int N> +RadosIo::AsyncOpInfo<N>::AsyncOpInfo(const std::array<uint64_t, N>& offset, + const std::array<uint64_t, N>& length) + : offset(offset), length(length) {} -bool RadosIo::readyForIoOp(IoOp &op) -{ - ceph_assert(ceph_mutex_is_locked_by_me(lock)); //Must be called with lock held +bool RadosIo::readyForIoOp(IoOp& op) { + ceph_assert( + ceph_mutex_is_locked_by_me(lock)); // Must be called with lock held if (!om->readyForIoOp(op)) { return false; } - switch (op.op) { - case OpType::Done: - case OpType::BARRIER: - return outstanding_io == 0; - default: - return outstanding_io < threads; + + switch (op.getOpType()) { + case OpType::Done: + case OpType::Barrier: + return outstanding_io == 0; + default: + return outstanding_io < threads; } } -void RadosIo::applyIoOp(IoOp &op) -{ - std::shared_ptr<AsyncOpInfo> op_info; - +void RadosIo::applyIoOp(IoOp& op) { om->applyIoOp(op); // If there are thread concurrent I/Os in flight then wait for // at least one I/O to complete - wait_for_io(threads-1); - - switch (op.op) { - case OpType::Done: - [[ fallthrough ]]; - case OpType::BARRIER: - // Wait for all outstanding I/O to complete - wait_for_io(0); - break; - - case OpType::CREATE: - { + wait_for_io(threads - 1); + + switch (op.getOpType()) { + case OpType::Done: + [[fallthrough]]; + case OpType::Barrier: + // Wait for all outstanding I/O to complete + wait_for_io(0); + break; + + case OpType::Create: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(0, op.length1); - op_info->bl1 = db->generate_data(0, op.length1); - op_info->wop.write_full(op_info->bl1); - auto create_cb = [this] (boost::system::error_code ec, - version_t ver) { + uint64_t opSize = static_cast<CreateOp&>(op).size; + std::shared_ptr<AsyncOpInfo<1>> op_info = + std::make_shared<AsyncOpInfo<1>>(std::array<uint64_t, 1>{0}, + std::array<uint64_t, 1>{opSize}); + op_info->bufferlist[0] = db->generate_data(0, opSize); + op_info->wop.write_full(op_info->bufferlist[0]); + auto create_cb = [this](boost::system::error_code ec, version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, create_cb); + librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, + create_cb); + break; } - break; - case OpType::REMOVE: - { + case OpType::Remove: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(); + auto op_info = std::make_shared<AsyncOpInfo<0>>(); op_info->wop.remove(); - auto remove_cb = [this] (boost::system::error_code ec, - version_t ver) { + auto remove_cb = [this](boost::system::error_code ec, version_t ver) { ceph_assert(ec == boost::system::errc::success); finish_io(); }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, remove_cb); + librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, + remove_cb); + break; } - break; + case OpType::Read: + [[fallthrough]]; + case OpType::Read2: + [[fallthrough]]; + case OpType::Read3: + [[fallthrough]]; + case OpType::Write: + [[fallthrough]]; + case OpType::Write2: + [[fallthrough]]; + case OpType::Write3: + [[fallthrough]]; + case OpType::FailedWrite: + [[fallthrough]]; + case OpType::FailedWrite2: + [[fallthrough]]; + case OpType::FailedWrite3: + applyReadWriteOp(op); + break; + case OpType::InjectReadError: + [[fallthrough]]; + case OpType::InjectWriteError: + [[fallthrough]]; + case OpType::ClearReadErrorInject: + [[fallthrough]]; + case OpType::ClearWriteErrorInject: + applyInjectOp(op); + break; + default: + ceph_abort_msg("Unrecognised Op"); + break; + } +} - case OpType::READ: - { - start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1); - op_info->rop.read(op.offset1 * block_size, - op.length1 * block_size, - &op_info->bl1, nullptr); - auto read_cb = [this, op_info] (boost::system::error_code ec, - version_t ver, - bufferlist bl) { - ceph_assert(ec == boost::system::errc::success); - ceph_assert(db->validate(op_info->bl1, - op_info->offset1, - op_info->length1)); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->rop, 0, nullptr, read_cb); - num_io++; +void RadosIo::applyReadWriteOp(IoOp& op) { + auto applyReadOp = [this]<OpType opType, int N>( + ReadWriteOp<opType, N> readOp) { + auto op_info = + std::make_shared<AsyncOpInfo<N>>(readOp.offset, readOp.length); + + for (int i = 0; i < N; i++) { + op_info->rop.read(readOp.offset[i] * block_size, + readOp.length[i] * block_size, &op_info->bufferlist[i], + nullptr); } - break; + auto read_cb = [this, op_info](boost::system::error_code ec, version_t ver, + bufferlist bl) { + ceph_assert(ec == boost::system::errc::success); + for (int i = 0; i < N; i++) { + ceph_assert(db->validate(op_info->bufferlist[i], op_info->offset[i], + op_info->length[i])); + } + finish_io(); + }; + librados::async_operate(asio, io, oid, &op_info->rop, 0, nullptr, read_cb); + num_io++; + }; - case OpType::READ2: - { - start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, - op.length1, - op.offset2, - op.length2); - - op_info->rop.read(op.offset1 * block_size, - op.length1 * block_size, - &op_info->bl1, nullptr); - op_info->rop.read(op.offset2 * block_size, - op.length2 * block_size, - &op_info->bl2, nullptr); - auto read2_cb = [this, op_info] (boost::system::error_code ec, - version_t ver, - bufferlist bl) { - ceph_assert(ec == boost::system::errc::success); - ceph_assert(db->validate(op_info->bl1, - op_info->offset1, - op_info->length1)); - ceph_assert(db->validate(op_info->bl2, - op_info->offset2, - op_info->length2)); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->rop, 0, nullptr, read2_cb); - num_io++; + auto applyWriteOp = [this]<OpType opType, int N>( + ReadWriteOp<opType, N> writeOp) { + auto op_info = + std::make_shared<AsyncOpInfo<N>>(writeOp.offset, writeOp.length); + for (int i = 0; i < N; i++) { + op_info->bufferlist[i] = + db->generate_data(writeOp.offset[i], writeOp.length[i]); + op_info->wop.write(writeOp.offset[i] * block_size, + op_info->bufferlist[i]); } - break; + auto write_cb = [this](boost::system::error_code ec, version_t ver) { + ceph_assert(ec == boost::system::errc::success); + finish_io(); + }; + librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, write_cb); + num_io++; + }; - case OpType::READ3: - { - start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1, - op.offset2, op.length2, - op.offset3, op.length3); - op_info->rop.read(op.offset1 * block_size, - op.length1 * block_size, - &op_info->bl1, nullptr); - op_info->rop.read(op.offset2 * block_size, - op.length2 * block_size, - &op_info->bl2, nullptr); - op_info->rop.read(op.offset3 * block_size, - op.length3 * block_size, - &op_info->bl3, nullptr); - auto read3_cb = [this, op_info] (boost::system::error_code ec, - version_t ver, - bufferlist bl) { - ceph_assert(ec == boost::system::errc::success); - ceph_assert(db->validate(op_info->bl1, - op_info->offset1, - op_info->length1)); - ceph_assert(db->validate(op_info->bl2, - op_info->offset2, - op_info->length2)); - ceph_assert(db->validate(op_info->bl3, - op_info->offset3, - op_info->length3)); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->rop, 0, nullptr, read3_cb); - num_io++; + auto applyFailedWriteOp = [this]<OpType opType, int N>( + ReadWriteOp<opType, N> writeOp) { + auto op_info = + std::make_shared<AsyncOpInfo<N>>(writeOp.offset, writeOp.length); + for (int i = 0; i < N; i++) { + op_info->bufferlist[i] = + db->generate_data(writeOp.offset[i], writeOp.length[i]); + op_info->wop.write(writeOp.offset[i] * block_size, + op_info->bufferlist[i]); } - break; + auto write_cb = [this, writeOp](boost::system::error_code ec, + version_t ver) { + ceph_assert(ec != boost::system::errc::success); + finish_io(); + }; + librados::async_operate(asio, io, oid, &op_info->wop, 0, nullptr, write_cb); + num_io++; + }; - case OpType::WRITE: - { + switch (op.getOpType()) { + case OpType::Read: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1); - op_info->bl1 = db->generate_data(op.offset1, op.length1); - - op_info->wop.write(op.offset1 * block_size, op_info->bl1); - auto write_cb = [this] (boost::system::error_code ec, - version_t ver) { - ceph_assert(ec == boost::system::errc::success); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, write_cb); - num_io++; + SingleReadOp& readOp = static_cast<SingleReadOp&>(op); + applyReadOp(readOp); + break; } - break; - - case OpType::WRITE2: - { + case OpType::Read2: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1, - op.offset2, op.length2); - op_info->bl1 = db->generate_data(op.offset1, op.length1); - op_info->bl2 = db->generate_data(op.offset2, op.length2); - op_info->wop.write(op.offset1 * block_size, op_info->bl1); - op_info->wop.write(op.offset2 * block_size, op_info->bl2); - auto write2_cb = [this] (boost::system::error_code ec, - version_t ver) { - ceph_assert(ec == boost::system::errc::success); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, write2_cb); - num_io++; + DoubleReadOp& readOp = static_cast<DoubleReadOp&>(op); + applyReadOp(readOp); + break; + } + case OpType::Read3: { + start_io(); + TripleReadOp& readOp = static_cast<TripleReadOp&>(op); + applyReadOp(readOp); + break; + } + case OpType::Write: { + start_io(); + SingleWriteOp& writeOp = static_cast<SingleWriteOp&>(op); + applyWriteOp(writeOp); + break; + } + case OpType::Write2: { + start_io(); + DoubleWriteOp& writeOp = static_cast<DoubleWriteOp&>(op); + applyWriteOp(writeOp); + break; + } + case OpType::Write3: { + start_io(); + TripleWriteOp& writeOp = static_cast<TripleWriteOp&>(op); + applyWriteOp(writeOp); + break; } - break; - case OpType::WRITE3: - { + case OpType::FailedWrite: { start_io(); - op_info = std::make_shared<AsyncOpInfo>(op.offset1, op.length1, - op.offset2, op.length2, - op.offset3, op.length3); - op_info->bl1 = db->generate_data(op.offset1, op.length1); - op_info->bl2 = db->generate_data(op.offset2, op.length2); - op_info->bl3 = db->generate_data(op.offset3, op.length3); - op_info->wop.write(op.offset1 * block_size, op_info->bl1); - op_info->wop.write(op.offset2 * block_size, op_info->bl2); - op_info->wop.write(op.offset3 * block_size, op_info->bl3); - auto write3_cb = [this] (boost::system::error_code ec, - version_t ver) { - ceph_assert(ec == boost::system::errc::success); - finish_io(); - }; - librados::async_operate(asio, io, oid, - &op_info->wop, 0, nullptr, write3_cb); - num_io++; + SingleFailedWriteOp& writeOp = static_cast<SingleFailedWriteOp&>(op); + applyFailedWriteOp(writeOp); + break; + } + case OpType::FailedWrite2: { + start_io(); + DoubleFailedWriteOp& writeOp = static_cast<DoubleFailedWriteOp&>(op); + applyFailedWriteOp(writeOp); + break; + } + case OpType::FailedWrite3: { + start_io(); + TripleFailedWriteOp& writeOp = static_cast<TripleFailedWriteOp&>(op); + applyFailedWriteOp(writeOp); + break; } - break; - default: - break; + default: + ceph_abort_msg( + fmt::format("Unsupported Read/Write operation ({})", op.getOpType())); + break; } } + +void RadosIo::applyInjectOp(IoOp& op) { + bufferlist osdmap_inbl, inject_inbl, osdmap_outbl, inject_outbl; + auto formatter = std::make_unique<JSONFormatter>(false); + std::ostringstream oss; + + int osd = -1; + std::vector<int> shard_order; + + ceph::messaging::osd::OSDMapRequest osdMapRequest{pool, get_oid(), ""}; + encode_json("OSDMapRequest", osdMapRequest, formatter.get()); + formatter->flush(oss); + int rc = rados.mon_command(oss.str(), osdmap_inbl, &osdmap_outbl, nullptr); + ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(osdmap_outbl.c_str(), osdmap_outbl.length()); + ceph_assert(success); + + ceph::messaging::osd::OSDMapReply reply; + reply.decode_json(&p); + + osd = reply.acting_primary; + shard_order = reply.acting; + + switch (op.getOpType()) { + case OpType::InjectReadError: { + InjectReadErrorOp& errorOp = static_cast<InjectReadErrorOp&>(op); + + if (errorOp.type == 0) { + ceph::messaging::osd::InjectECErrorRequest<InjectOpType::ReadEIO> + injectErrorRequest{pool, oid, errorOp.shard, + errorOp.type, errorOp.when, errorOp.duration}; + encode_json("InjectECErrorRequest", injectErrorRequest, + formatter.get()); + } else if (errorOp.type == 1) { + ceph::messaging::osd::InjectECErrorRequest< + InjectOpType::ReadMissingShard> + injectErrorRequest{pool, oid, errorOp.shard, + errorOp.type, errorOp.when, errorOp.duration}; + encode_json("InjectECErrorRequest", injectErrorRequest, + formatter.get()); + } else { + ceph_abort_msg("Unsupported inject type"); + } + formatter->flush(oss); + int rc = rados.osd_command(osd, oss.str(), inject_inbl, &inject_outbl, + nullptr); + ceph_assert(rc == 0); + break; + } + case OpType::InjectWriteError: { + InjectWriteErrorOp& errorOp = static_cast<InjectWriteErrorOp&>(op); + + if (errorOp.type == 0) { + ceph::messaging::osd::InjectECErrorRequest< + InjectOpType::WriteFailAndRollback> + injectErrorRequest{pool, oid, errorOp.shard, + errorOp.type, errorOp.when, errorOp.duration}; + encode_json("InjectECErrorRequest", injectErrorRequest, + formatter.get()); + } else if (errorOp.type == 3) { + ceph::messaging::osd::InjectECErrorRequest<InjectOpType::WriteOSDAbort> + injectErrorRequest{pool, oid, errorOp.shard, + errorOp.type, errorOp.when, errorOp.duration}; + encode_json("InjectECErrorRequest", injectErrorRequest, + formatter.get()); + + // This inject is sent directly to the shard we want to inject the error + // on + osd = shard_order[errorOp.shard]; + } else { + ceph_abort("Unsupported inject type"); + } + + formatter->flush(oss); + int rc = rados.osd_command(osd, oss.str(), inject_inbl, &inject_outbl, + nullptr); + ceph_assert(rc == 0); + break; + } + case OpType::ClearReadErrorInject: { + ClearReadErrorInjectOp& errorOp = + static_cast<ClearReadErrorInjectOp&>(op); + + if (errorOp.type == 0) { + ceph::messaging::osd::InjectECClearErrorRequest<InjectOpType::ReadEIO> + clearErrorInject{pool, oid, errorOp.shard, errorOp.type}; + encode_json("InjectECClearErrorRequest", clearErrorInject, + formatter.get()); + } else if (errorOp.type == 1) { + ceph::messaging::osd::InjectECClearErrorRequest< + InjectOpType::ReadMissingShard> + clearErrorInject{pool, oid, errorOp.shard, errorOp.type}; + encode_json("InjectECClearErrorRequest", clearErrorInject, + formatter.get()); + } else { + ceph_abort("Unsupported inject type"); + } + + formatter->flush(oss); + int rc = rados.osd_command(osd, oss.str(), inject_inbl, &inject_outbl, + nullptr); + ceph_assert(rc == 0); + break; + } + case OpType::ClearWriteErrorInject: { + ClearReadErrorInjectOp& errorOp = + static_cast<ClearReadErrorInjectOp&>(op); + + if (errorOp.type == 0) { + ceph::messaging::osd::InjectECClearErrorRequest< + InjectOpType::WriteFailAndRollback> + clearErrorInject{pool, oid, errorOp.shard, errorOp.type}; + encode_json("InjectECClearErrorRequest", clearErrorInject, + formatter.get()); + } else if (errorOp.type == 3) { + ceph::messaging::osd::InjectECClearErrorRequest< + InjectOpType::WriteOSDAbort> + clearErrorInject{pool, oid, errorOp.shard, errorOp.type}; + encode_json("InjectECClearErrorRequest", clearErrorInject, + formatter.get()); + } else { + ceph_abort("Unsupported inject type"); + } + + formatter->flush(oss); + int rc = rados.osd_command(osd, oss.str(), inject_inbl, &inject_outbl, + nullptr); + ceph_assert(rc == 0); + break; + } + default: + ceph_abort_msg( + fmt::format("Unsupported inject operation ({})", op.getOpType())); + break; + } +}
\ No newline at end of file diff --git a/src/common/io_exerciser/RadosIo.h b/src/common/io_exerciser/RadosIo.h index 179c5bba3ae..a5c66ad4768 100644 --- a/src/common/io_exerciser/RadosIo.h +++ b/src/common/io_exerciser/RadosIo.h @@ -10,71 +10,65 @@ * in the object. Uses DataBuffer to create and validate * data buffers. When there are not barrier I/Os this may * issue multiple async I/Os in parallel. - * + * */ namespace ceph { - namespace io_exerciser { - namespace data_generation { - class DataGenerator; - } - - class RadosIo: public Model { - protected: - librados::Rados& rados; - boost::asio::io_context& asio; - std::unique_ptr<ObjectModel> om; - std::unique_ptr<ceph::io_exerciser::data_generation::DataGenerator> db; - std::string pool; - int threads; - ceph::mutex& lock; - ceph::condition_variable& cond; - librados::IoCtx io; - int outstanding_io; +namespace io_exerciser { +namespace data_generation { +class DataGenerator; +} + +class RadosIo : public Model { + protected: + librados::Rados& rados; + boost::asio::io_context& asio; + std::unique_ptr<ObjectModel> om; + std::unique_ptr<ceph::io_exerciser::data_generation::DataGenerator> db; + std::string pool; + std::optional<std::vector<int>> cached_shard_order; + int threads; + ceph::mutex& lock; + ceph::condition_variable& cond; + librados::IoCtx io; + int outstanding_io; + + void start_io(); + void finish_io(); + void wait_for_io(int count); + + public: + RadosIo(librados::Rados& rados, boost::asio::io_context& asio, + const std::string& pool, const std::string& oid, + const std::optional<std::vector<int>>& cached_shard_order, + uint64_t block_size, int seed, int threads, ceph::mutex& lock, + ceph::condition_variable& cond); - void start_io(); - void finish_io(); - void wait_for_io(int count); - - public: - RadosIo(librados::Rados& rados, - boost::asio::io_context& asio, - const std::string& pool, - const std::string& oid, - uint64_t block_size, - int seed, - int threads, - ceph::mutex& lock, - ceph::condition_variable& cond); + ~RadosIo(); - ~RadosIo(); + void allow_ec_overwrites(bool allow); - void allow_ec_overwrites(bool allow); + template <int N> + class AsyncOpInfo { + public: + librados::ObjectReadOperation rop; + librados::ObjectWriteOperation wop; + std::array<ceph::bufferlist, N> bufferlist; + std::array<uint64_t, N> offset; + std::array<uint64_t, N> length; - class AsyncOpInfo { - public: - librados::ObjectReadOperation rop; - librados::ObjectWriteOperation wop; - ceph::buffer::list bl1; - ceph::buffer::list bl2; - ceph::buffer::list bl3; - uint64_t offset1; - uint64_t length1; - uint64_t offset2; - uint64_t length2; - uint64_t offset3; - uint64_t length3; + AsyncOpInfo(const std::array<uint64_t, N>& offset = {}, + const std::array<uint64_t, N>& length = {}); + ~AsyncOpInfo() = default; + }; - AsyncOpInfo(uint64_t offset1 = 0, uint64_t length1 = 0, - uint64_t offset2 = 0, uint64_t length2 = 0, - uint64_t offset3 = 0, uint64_t length3 = 0 ); - ~AsyncOpInfo() = default; - }; + // Must be called with lock held + bool readyForIoOp(IoOp& op); + void applyIoOp(IoOp& op); - // Must be called with lock held - bool readyForIoOp(IoOp& op); - - void applyIoOp(IoOp& op); - }; - } -}
\ No newline at end of file + private: + void applyReadWriteOp(IoOp& op); + void applyInjectOp(IoOp& op); +}; +} // namespace io_exerciser +} // namespace ceph
\ No newline at end of file diff --git a/src/common/json/BalancerStructures.cc b/src/common/json/BalancerStructures.cc new file mode 100644 index 00000000000..48dfb843761 --- /dev/null +++ b/src/common/json/BalancerStructures.cc @@ -0,0 +1,38 @@ +#include "BalancerStructures.h" + +#include "common/ceph_json.h" + +using namespace ceph::messaging::balancer; + +void BalancerOffRequest::dump(Formatter* f) const { + encode_json("prefix", "balancer off", f); +} + +void BalancerOffRequest::decode_json(JSONObj* obj) {} + +void BalancerStatusRequest::dump(Formatter* f) const { + encode_json("prefix", "balancer status", f); +} + +void BalancerStatusRequest::decode_json(JSONObj* obj) {} + +void BalancerStatusReply::dump(Formatter* f) const { + encode_json("active", active, f); + encode_json("last_optimization_duration", last_optimization_duration, f); + encode_json("last_optimization_started", last_optimization_started, f); + encode_json("mode", mode, f); + encode_json("no_optimization_needed", no_optimization_needed, f); + encode_json("optimize_result", optimize_result, f); +} + +void BalancerStatusReply::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("active", active, obj); + JSONDecoder::decode_json("last_optimization_duration", + last_optimization_duration, obj); + JSONDecoder::decode_json("last_optimization_started", + last_optimization_started, obj); + JSONDecoder::decode_json("mode", mode, obj); + JSONDecoder::decode_json("no_optimization_needed", no_optimization_needed, + obj); + JSONDecoder::decode_json("optimize_result", optimize_result, obj); +}
\ No newline at end of file diff --git a/src/common/json/BalancerStructures.h b/src/common/json/BalancerStructures.h new file mode 100644 index 00000000000..bbf5c748eb3 --- /dev/null +++ b/src/common/json/BalancerStructures.h @@ -0,0 +1,35 @@ +#pragma once + +#include <string> + +#include "include/types.h" + +class JSONObj; + +namespace ceph { +namespace messaging { +namespace balancer { +struct BalancerOffRequest { + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct BalancerStatusRequest { + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct BalancerStatusReply { + bool active; + std::string last_optimization_duration; + std::string last_optimization_started; + std::string mode; + bool no_optimization_needed; + std::string optimize_result; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; +} // namespace balancer +} // namespace messaging +} // namespace ceph
\ No newline at end of file diff --git a/src/common/json/CMakeLists.txt b/src/common/json/CMakeLists.txt new file mode 100644 index 00000000000..1497daf93db --- /dev/null +++ b/src/common/json/CMakeLists.txt @@ -0,0 +1,4 @@ +add_library(json_structures STATIC + BalancerStructures.cc ConfigStructures.cc OSDStructures.cc) + + target_link_libraries(json_structures global)
\ No newline at end of file diff --git a/src/common/json/ConfigStructures.cc b/src/common/json/ConfigStructures.cc new file mode 100644 index 00000000000..651278d002a --- /dev/null +++ b/src/common/json/ConfigStructures.cc @@ -0,0 +1,20 @@ +#include "ConfigStructures.h" + +#include "common/ceph_json.h" + +using namespace ceph::messaging::config; + +void ConfigSetRequest::dump(Formatter* f) const { + encode_json("prefix", "config set", f); + encode_json("who", who, f); + encode_json("name", name, f); + encode_json("value", value, f); + encode_json("force", force, f); +} + +void ConfigSetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("who", who, obj); + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("value", value, obj); + JSONDecoder::decode_json("force", force, obj); +}
\ No newline at end of file diff --git a/src/common/json/ConfigStructures.h b/src/common/json/ConfigStructures.h new file mode 100644 index 00000000000..554229d75f4 --- /dev/null +++ b/src/common/json/ConfigStructures.h @@ -0,0 +1,24 @@ +#pragma once + +#include <optional> +#include <string> + +#include "include/types.h" + +class JSONObj; + +namespace ceph { +namespace messaging { +namespace config { +struct ConfigSetRequest { + std::string who; + std::string name; + std::string value; + std::optional<bool> force; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; +} // namespace config +} // namespace messaging +} // namespace ceph
\ No newline at end of file diff --git a/src/common/json/OSDStructures.cc b/src/common/json/OSDStructures.cc new file mode 100644 index 00000000000..aaac5f6e169 --- /dev/null +++ b/src/common/json/OSDStructures.cc @@ -0,0 +1,150 @@ +#include "OSDStructures.h" + +#include "common/ceph_json.h" +#include "common/io_exerciser/OpType.h" + +using namespace ceph::messaging::osd; + +void OSDMapRequest::dump(Formatter* f) const { + encode_json("prefix", "osd map", f); + encode_json("pool", pool, f); + encode_json("object", object, f); + encode_json("nspace", nspace, f); + encode_json("format", format, f); +} + +void OSDMapRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("object", object, obj); + JSONDecoder::decode_json("nspace", nspace, obj); + JSONDecoder::decode_json("format", format, obj); +} + +void OSDMapReply::dump(Formatter* f) const { + encode_json("epoch", epoch, f); + encode_json("pool", pool, f); + encode_json("pool_id", pool_id, f); + encode_json("objname", objname, f); + encode_json("raw_pgid", raw_pgid, f); + encode_json("pgid", pgid, f); + encode_json("up", up, f); + encode_json("up_primary", up_primary, f); + encode_json("acting", acting, f); + encode_json("acting_primary", acting_primary, f); +} + +void OSDMapReply::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("epoch", epoch, obj); + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("pool_id", pool_id, obj); + JSONDecoder::decode_json("objname", objname, obj); + JSONDecoder::decode_json("raw_pgid", raw_pgid, obj); + JSONDecoder::decode_json("pgid", pgid, obj); + JSONDecoder::decode_json("up", up, obj); + JSONDecoder::decode_json("up_primary", up_primary, obj); + JSONDecoder::decode_json("acting", acting, obj); + JSONDecoder::decode_json("acting_primary", acting_primary, obj); +} + +void OSDPoolGetRequest::dump(Formatter* f) const { + encode_json("prefix", "osd pool get", f); + encode_json("pool", pool, f); + encode_json("var", var, f); + encode_json("format", format, f); +} + +void OSDPoolGetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("var", var, obj); + JSONDecoder::decode_json("format", format, obj); +} + +void OSDPoolGetReply::dump(Formatter* f) const { + encode_json("erasure_code_profile", erasure_code_profile, f); +} + +void OSDPoolGetReply::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("erasure_code_profile", erasure_code_profile, obj); +} + +void OSDECProfileGetRequest::dump(Formatter* f) const { + encode_json("prefix", "osd pool get", f); + encode_json("name", name, f); + encode_json("format", format, f); +} + +void OSDECProfileGetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("format", format, obj); +} + +void OSDECProfileGetReply::dump(Formatter* f) const { + encode_json("crush-device-class", crush_device_class, f); + encode_json("crush-failure-domain", crush_failure_domain, f); + encode_json("crush-num-failure-domains", crush_num_failure_domains, f); + encode_json("crush-osds-per-failure-domain", crush_osds_per_failure_domain, + f); + encode_json("crush-root", crush_root, f); + encode_json("jerasure-per-chunk-alignment", jerasure_per_chunk_alignment, f); + encode_json("k", k, f); + encode_json("m", m, f); + encode_json("plugin", plugin, f); + encode_json("technique", technique, f); + encode_json("w", w, f); +} + +void OSDECProfileGetReply::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("crush-device-class", crush_device_class, obj); + JSONDecoder::decode_json("crush-failure-domain", crush_failure_domain, obj); + JSONDecoder::decode_json("crush-num-failure-domains", + crush_num_failure_domains, obj); + JSONDecoder::decode_json("crush-osds-per-failure-domain", + crush_osds_per_failure_domain, obj); + JSONDecoder::decode_json("crush-root", crush_root, obj); + JSONDecoder::decode_json("jerasure-per-chunk-alignment", + jerasure_per_chunk_alignment, obj); + JSONDecoder::decode_json("k", k, obj); + JSONDecoder::decode_json("m", m, obj); + JSONDecoder::decode_json("plugin", plugin, obj); + JSONDecoder::decode_json("technique", technique, obj); + JSONDecoder::decode_json("w", w, obj); +} + +void OSDECProfileSetRequest::dump(Formatter* f) const { + encode_json("prefix", "osd erasure-code-profile set", f); + encode_json("name", name, f); + encode_json("profile", profile, f); +} + +void OSDECProfileSetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("name", name, obj); + JSONDecoder::decode_json("profile", profile, obj); +} + +void OSDECPoolCreateRequest::dump(Formatter* f) const { + encode_json("prefix", "osd pool create", f); + encode_json("pool", pool, f); + encode_json("pool_type", pool_type, f); + encode_json("pg_num", pg_num, f); + encode_json("pgp_num", pgp_num, f); + encode_json("erasure_code_profile", erasure_code_profile, f); +} + +void OSDECPoolCreateRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("pool_type", pool_type, obj); + JSONDecoder::decode_json("pg_num", pg_num, obj); + JSONDecoder::decode_json("pgp_num", pgp_num, obj); + JSONDecoder::decode_json("erasure_code_profile", erasure_code_profile, obj); +} + +void OSDSetRequest::dump(Formatter* f) const { + encode_json("prefix", "osd set", f); + encode_json("key", key, f); + encode_json("yes_i_really_mean_it", yes_i_really_mean_it, f); +} + +void OSDSetRequest::decode_json(JSONObj* obj) { + JSONDecoder::decode_json("key", key, obj); + JSONDecoder::decode_json("yes_i_really_mean_it", yes_i_really_mean_it, obj); +}
\ No newline at end of file diff --git a/src/common/json/OSDStructures.h b/src/common/json/OSDStructures.h new file mode 100644 index 00000000000..3e4528a099f --- /dev/null +++ b/src/common/json/OSDStructures.h @@ -0,0 +1,189 @@ +#pragma once + +#include <memory> +#include <string> +#include <vector> + +#include "common/ceph_json.h" +#include "common/io_exerciser/OpType.h" +#include "include/types.h" + +class JSONObj; + +namespace ceph { +namespace messaging { +namespace osd { +struct OSDMapRequest { + std::string pool; + std::string object; + std::string nspace; + std::string format = "json"; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDMapReply { + epoch_t epoch; + std::string pool; + uint64_t pool_id; + std::string objname; + std::string raw_pgid; + std::string pgid; + std::vector<int> up; + int up_primary; + std::vector<int> acting; + int acting_primary; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDPoolGetRequest { + std::string pool; + std::string var = "erasure_code_profile"; + std::string format = "json"; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDPoolGetReply { + std::string erasure_code_profile; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDECProfileGetRequest { + std::string name; + std::string format = "json"; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDECProfileGetReply { + std::string crush_device_class; + std::string crush_failure_domain; + int crush_num_failure_domains; + int crush_osds_per_failure_domain; + std::string crush_root; + bool jerasure_per_chunk_alignment; + int k; + int m; + std::string plugin; + std::string technique; + std::string w; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDECProfileSetRequest { + std::string name; + std::vector<std::string> profile; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDECPoolCreateRequest { + std::string pool; + std::string pool_type; + int pg_num; + int pgp_num; + std::string erasure_code_profile; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +struct OSDSetRequest { + std::string key; + std::optional<bool> yes_i_really_mean_it = std::nullopt; + + void dump(Formatter* f) const; + void decode_json(JSONObj* obj); +}; + +// These structures are sent directly to the relevant OSD +// rather than the monitor +template <io_exerciser::InjectOpType op_type> +struct InjectECErrorRequest { + std::string pool; + std::string objname; + int shardid; + std::optional<uint64_t> type; + std::optional<uint64_t> when; + std::optional<uint64_t> duration; + + void dump(Formatter* f) const { + switch (op_type) { + case io_exerciser::InjectOpType::ReadEIO: + [[fallthrough]]; + case io_exerciser::InjectOpType::ReadMissingShard: + ::encode_json("prefix", "injectecreaderr", f); + break; + case io_exerciser::InjectOpType::WriteFailAndRollback: + [[fallthrough]]; + case io_exerciser::InjectOpType::WriteOSDAbort: + ::encode_json("prefix", "injectecwriteerr", f); + break; + default: + ceph_abort_msg("Unsupported Inject Type"); + } + ::encode_json("pool", pool, f); + ::encode_json("objname", objname, f); + ::encode_json("shardid", shardid, f); + ::encode_json("type", type, f); + ::encode_json("when", when, f); + ::encode_json("duration", duration, f); + } + void decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("objname", objname, obj); + JSONDecoder::decode_json("shardid", shardid, obj); + JSONDecoder::decode_json("type", type, obj); + JSONDecoder::decode_json("when", when, obj); + JSONDecoder::decode_json("duration", duration, obj); + } +}; + +template <io_exerciser::InjectOpType op_type> +struct InjectECClearErrorRequest { + std::string pool; + std::string objname; + int shardid; + std::optional<uint64_t> type; + + void dump(Formatter* f) const { + switch (op_type) { + case io_exerciser::InjectOpType::ReadEIO: + [[fallthrough]]; + case io_exerciser::InjectOpType::ReadMissingShard: + ::encode_json("prefix", "injectecclearreaderr", f); + break; + case io_exerciser::InjectOpType::WriteFailAndRollback: + [[fallthrough]]; + case io_exerciser::InjectOpType::WriteOSDAbort: + ::encode_json("prefix", "injectecclearwriteerr", f); + break; + default: + ceph_abort_msg("Unsupported Inject Type"); + } + ::encode_json("pool", pool, f); + ::encode_json("objname", objname, f); + ::encode_json("shardid", shardid, f); + ::encode_json("type", type, f); + } + void decode_json(JSONObj* obj) { + JSONDecoder::decode_json("pool", pool, obj); + JSONDecoder::decode_json("objname", objname, obj); + JSONDecoder::decode_json("shardid", shardid, obj); + JSONDecoder::decode_json("type", type, obj); + } +}; +} // namespace osd +} // namespace messaging +} // namespace ceph
\ No newline at end of file diff --git a/src/common/options/crimson.yaml.in b/src/common/options/crimson.yaml.in index abef7483bcf..132a4a09e89 100644 --- a/src/common/options/crimson.yaml.in +++ b/src/common/options/crimson.yaml.in @@ -6,7 +6,7 @@ options: type: str level: advanced desc: backend type for a Crimson OSD (e.g seastore or bluestore) - default: seastore + default: bluestore enum_values: - bluestore - seastore diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index 4cbc079a215..03a53cd7cea 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1745,4 +1745,4 @@ options: default: 16 services: - mds - min: 8 + min: 4 diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in index ab1634bc154..1307030e3fb 100644 --- a/src/common/options/mon.yaml.in +++ b/src/common/options/mon.yaml.in @@ -91,6 +91,13 @@ options: default: 1000 services: - mon +- name: mon_nvmeofgw_delete_grace + type: secs + level: advanced + desc: Issue NVMEOF_GATEWAY_DELETING health warning after this amount of time has elapsed + default: 15_min + services: + - mon - name: mon_mgr_inactive_grace type: int level: advanced diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc index d125d7171e0..a0629a15686 100644 --- a/src/common/pick_address.cc +++ b/src/common/pick_address.cc @@ -642,17 +642,24 @@ int get_iface_numa_node( bool is_addr_in_subnet( CephContext *cct, const std::string &networks, - const std::string &addr) + const entity_addr_t &addr) { const auto nets = get_str_list(networks); ceph_assert(!nets.empty()); - unsigned ipv = CEPH_PICK_ADDRESS_IPV4; - struct sockaddr_in public_addr; - public_addr.sin_family = AF_INET; - - if(inet_pton(AF_INET, addr.c_str(), &public_addr.sin_addr) != 1) { - lderr(cct) << "unable to convert chosen address to string: " << addr << dendl; + struct sockaddr_in6 public_addr6; + struct sockaddr_in public_addr4; + + if (addr.is_ipv4() && + inet_pton(AF_INET, addr.ip_only_to_str().c_str(), &public_addr4.sin_addr) == 1) { + public_addr4.sin_family = AF_INET; + } else if (addr.is_ipv6() && + inet_pton(AF_INET6, addr.ip_only_to_str().c_str(), &public_addr6.sin6_addr) == 1) { + public_addr6.sin6_family = AF_INET6; + ipv = CEPH_PICK_ADDRESS_IPV6; + } else { + std::string_view addr_type = addr.is_ipv4() ? "IPv4" : "IPv6"; + lderr(cct) << "IP address " << addr << " is not parseable as " << addr_type << dendl; return false; } @@ -660,10 +667,16 @@ bool is_addr_in_subnet( struct ifaddrs ifa; memset(&ifa, 0, sizeof(ifa)); ifa.ifa_next = nullptr; - ifa.ifa_addr = (struct sockaddr*)&public_addr; + if (addr.is_ipv4()) { + ifa.ifa_addr = (struct sockaddr*)&public_addr4; + } else if (addr.is_ipv6()) { + ifa.ifa_addr = (struct sockaddr*)&public_addr6; + } + if(matches_with_net(cct, ifa, net, ipv)) { return true; } } + lderr(cct) << "address " << addr << " is not in networks '" << networks << "'" << dendl; return false; } diff --git a/src/common/pick_address.h b/src/common/pick_address.h index 40575d7d155..c28a6037ded 100644 --- a/src/common/pick_address.h +++ b/src/common/pick_address.h @@ -98,6 +98,6 @@ int get_iface_numa_node( bool is_addr_in_subnet( CephContext *cct, const std::string &networks, - const std::string &addr); + const entity_addr_t &addr); #endif diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h index 92d99d332c4..0d73658e709 100644 --- a/src/crimson/common/shared_lru.h +++ b/src/crimson/common/shared_lru.h @@ -25,12 +25,17 @@ class SharedLRU { SimpleLRU<K, shared_ptr_t, false> cache; std::map<K, std::pair<weak_ptr_t, V*>> weak_refs; + // Once all of the shared pointers are destoryed, + // erase the tracked object from the weak_ref map + // before actually destorying it struct Deleter { - SharedLRU<K,V>* cache; + SharedLRU<K,V>* shared_lru_ptr; const K key; - void operator()(V* ptr) { - cache->_erase_weak(key); - delete ptr; + void operator()(V* value_ptr) { + if (shared_lru_ptr) { + shared_lru_ptr->_erase_weak(key); + } + delete value_ptr; } }; void _erase_weak(const K& key) { @@ -42,9 +47,19 @@ public: {} ~SharedLRU() { cache.clear(); + // initially, we were assuming that no pointer obtained from SharedLRU // can outlive the lru itself. However, since going with the interruption // concept for handling shutdowns, this is no longer valid. + // Moreover, before clearing weak_refs, invalidate each deleter + // cache pointer as this SharedLRU is being destoryed. + for (const auto& [key, value] : weak_refs) { + shared_ptr_t val; + val = value.first.lock(); + auto this_deleter = get_deleter<Deleter>(val); + this_deleter->shared_lru_ptr = nullptr; + } + weak_refs.clear(); } /** diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc index f390823a8a0..db6decd84f9 100644 --- a/src/crimson/os/alienstore/alien_store.cc +++ b/src/crimson/os/alienstore/alien_store.cc @@ -141,7 +141,8 @@ seastar::future<> AlienStore::stop() AlienStore::base_errorator::future<bool> AlienStore::exists( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { return op_gates.simple_dispatch("exists", [=, this] { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] { @@ -212,7 +213,8 @@ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> AlienStore::list_objects(CollectionRef ch, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const + uint64_t limit, + uint32_t op_flags) const { logger().debug("{}", __func__); assert(tp); @@ -348,7 +350,8 @@ AlienStore::readv(CollectionRef ch, AlienStore::get_attr_errorator::future<ceph::bufferlist> AlienStore::get_attr(CollectionRef ch, const ghobject_t& oid, - std::string_view name) const + std::string_view name, + uint32_t op_flags) const { logger().debug("{}", __func__); assert(tp); @@ -376,7 +379,8 @@ AlienStore::get_attr(CollectionRef ch, AlienStore::get_attrs_ertr::future<AlienStore::attrs_t> AlienStore::get_attrs(CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { logger().debug("{}", __func__); assert(tp); @@ -397,7 +401,8 @@ AlienStore::get_attrs(CollectionRef ch, auto AlienStore::omap_get_values(CollectionRef ch, const ghobject_t& oid, - const set<string>& keys) + const set<string>& keys, + uint32_t op_flags) -> read_errorator::future<omap_values_t> { logger().debug("{}", __func__); @@ -421,7 +426,8 @@ auto AlienStore::omap_get_values(CollectionRef ch, auto AlienStore::omap_get_values(CollectionRef ch, const ghobject_t &oid, - const std::optional<string> &start) + const std::optional<string> &start, + uint32_t op_flags) -> read_errorator::future<std::tuple<bool, omap_values_t>> { logger().debug("{} with_start", __func__); @@ -429,8 +435,21 @@ auto AlienStore::omap_get_values(CollectionRef ch, return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] { auto c = static_cast<AlienCollection*>(ch.get()); - return store->omap_get_values(c->collection, oid, start, - reinterpret_cast<map<string, bufferlist>*>(&values)); + return store->omap_iterate( + c->collection, oid, + ObjectStore::omap_iter_seek_t{ + .seek_position = start.value_or(std::string{}), + // FIXME: classical OSDs begins iteration from LOWER_BOUND + // (or UPPER_BOUND if filter_prefix > start). However, these + // bits are not implemented yet + .seek_type = ObjectStore::omap_iter_seek_t::UPPER_BOUND + }, + [&values] + (std::string_view key, std::string_view value) mutable { + values[std::string{key}].append(value); + // FIXME: there is limit on number of entries yet + return ObjectStore::omap_iter_ret_t::NEXT; + }); }).then([&values] (int r) -> read_errorator::future<std::tuple<bool, omap_values_t>> { if (r == -ENOENT) { @@ -578,7 +597,8 @@ unsigned AlienStore::get_max_attr_name_length() const seastar::future<struct stat> AlienStore::stat( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { assert(tp); return do_with_op_gate((struct stat){}, [this, ch, oid](auto& st) { @@ -604,7 +624,8 @@ seastar::future<std::string> AlienStore::get_default_device_class() } auto AlienStore::omap_get_header(CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) -> get_attr_errorator::future<ceph::bufferlist> { assert(tp); @@ -630,7 +651,8 @@ AlienStore::read_errorator::future<std::map<uint64_t, uint64_t>> AlienStore::fie CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { assert(tp); return do_with_op_gate(std::map<uint64_t, uint64_t>(), [=, this](auto& destmap) { diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h index 853585dac9c..1d39411450e 100644 --- a/src/crimson/os/alienstore/alien_store.h +++ b/src/crimson/os/alienstore/alien_store.h @@ -36,7 +36,8 @@ public: base_errorator::future<bool> exists( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final; read_errorator::future<ceph::bufferlist> read(CollectionRef c, const ghobject_t& oid, @@ -49,29 +50,36 @@ public: uint32_t op_flags = 0) final; - get_attr_errorator::future<ceph::bufferlist> get_attr(CollectionRef c, - const ghobject_t& oid, - std::string_view name) const final; - get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c, - const ghobject_t& oid) final; + get_attr_errorator::future<ceph::bufferlist> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name, + uint32_t op_flags = 0) const final; + get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) final; + const omap_keys_t& keys, + uint32_t op_flags = 0) final; /// Retrieves paged set of values > start (if present) read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) final; ///< @return <done, values> values.empty() iff done seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const final; + uint64_t limit, + uint32_t op_flags = 0) const final; seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; seastar::future<CollectionRef> open_collection(const coll_t& cid) final; @@ -97,16 +105,19 @@ public: unsigned get_max_attr_name_length() const final; seastar::future<struct stat> stat( CollectionRef, - const ghobject_t&) final; + const ghobject_t&, + uint32_t op_flags = 0) final; seastar::future<std::string> get_default_device_class() final; get_attr_errorator::future<ceph::bufferlist> omap_get_header( CollectionRef, - const ghobject_t&) final; + const ghobject_t&, + uint32_t) final; read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( CollectionRef, const ghobject_t&, uint64_t off, - uint64_t len) final; + uint64_t len, + uint32_t op_flags) final; FuturizedStore::Shard& get_sharded_store() final { return *this; diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc index a8bf514de15..41819fb5eb6 100644 --- a/src/crimson/os/cyanstore/cyan_store.cc +++ b/src/crimson/os/cyanstore/cyan_store.cc @@ -208,7 +208,8 @@ CyanStore::Shard::list_objects( CollectionRef ch, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const + uint64_t limit, + uint32_t op_flags) const { auto c = static_cast<Collection*>(ch.get()); logger().debug("{} {} {} {} {}", @@ -257,7 +258,8 @@ CyanStore::Shard::list_collections() CyanStore::Shard::base_errorator::future<bool> CyanStore::Shard::exists( CollectionRef ch, - const ghobject_t &oid) + const ghobject_t &oid, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); if (!c->exists) { @@ -333,7 +335,8 @@ CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist> CyanStore::Shard::get_attr( CollectionRef ch, const ghobject_t& oid, - std::string_view name) const + std::string_view name, + uint32_t op_flags) const { auto c = static_cast<Collection*>(ch.get()); logger().debug("{} {} {}", @@ -352,7 +355,8 @@ CyanStore::Shard::get_attr( CyanStore::Shard::get_attrs_ertr::future<CyanStore::Shard::attrs_t> CyanStore::Shard::get_attrs( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); logger().debug("{} {} {}", @@ -367,7 +371,8 @@ CyanStore::Shard::get_attrs( auto CyanStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t& oid, - const omap_keys_t& keys) + const omap_keys_t& keys, + uint32_t op_flags) -> read_errorator::future<omap_values_t> { auto c = static_cast<Collection*>(ch.get()); @@ -388,7 +393,8 @@ auto CyanStore::Shard::omap_get_values( auto CyanStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const std::optional<string> &start) + const std::optional<string> &start, + uint32_t op_flags) -> CyanStore::Shard::read_errorator::future<std::tuple<bool, omap_values_t>> { auto c = static_cast<Collection*>(ch.get()); @@ -409,7 +415,8 @@ auto CyanStore::Shard::omap_get_values( auto CyanStore::Shard::omap_get_header( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) -> CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist> { auto c = static_cast<Collection*>(ch.get()); @@ -977,7 +984,8 @@ CyanStore::Shard::fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); @@ -992,7 +1000,8 @@ CyanStore::Shard::fiemap( seastar::future<struct stat> CyanStore::Shard::stat( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); auto o = c->get_object(oid); diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h index e9394991bc2..1d481ef5829 100644 --- a/src/crimson/os/cyanstore/cyan_store.h +++ b/src/crimson/os/cyanstore/cyan_store.h @@ -34,11 +34,13 @@ public: seastar::future<struct stat> stat( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; base_errorator::future<bool> exists( CollectionRef ch, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<ceph::bufferlist> read( CollectionRef c, @@ -56,33 +58,39 @@ public: get_attr_errorator::future<ceph::bufferlist> get_attr( CollectionRef c, const ghobject_t& oid, - std::string_view name) const final; + std::string_view name, + uint32_t op_flags = 0) const final; get_attrs_ertr::future<attrs_t> get_attrs( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) final; + const omap_keys_t& keys, + uint32_t op_flags = 0) final; read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) final; get_attr_errorator::future<ceph::bufferlist> omap_get_header( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const final; + uint64_t limit, + uint32_t op_flags = 0) const final; seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; @@ -101,7 +109,8 @@ public: CollectionRef c, const ghobject_t& oid, uint64_t off, - uint64_t len) final; + uint64_t len, + uint32_t op_flags) final; unsigned get_max_attr_name_length() const final; diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h index 51ef2331014..e7d4c8546de 100644 --- a/src/crimson/os/futurized_store.h +++ b/src/crimson/os/futurized_store.h @@ -54,7 +54,8 @@ public: virtual base_errorator::future<bool> exists( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; using get_attr_errorator = crimson::errorator< crimson::ct_error::enoent, @@ -62,42 +63,49 @@ public: virtual get_attr_errorator::future<ceph::bufferlist> get_attr( CollectionRef c, const ghobject_t& oid, - std::string_view name) const = 0; + std::string_view name, + uint32_t op_flags = 0) const = 0; using get_attrs_ertr = crimson::errorator< crimson::ct_error::enoent>; using attrs_t = std::map<std::string, ceph::bufferlist, std::less<>>; virtual get_attrs_ertr::future<attrs_t> get_attrs( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; virtual seastar::future<struct stat> stat( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; using omap_values_t = attrs_t; using omap_keys_t = std::set<std::string>; virtual read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) = 0; + const omap_keys_t& keys, + uint32_t op_flags = 0) = 0; using omap_values_paged_t = std::tuple<bool, omap_values_t>; virtual read_errorator::future<omap_values_paged_t> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) = 0; ///< @return <done, values> values.empty() only if done virtual get_attr_errorator::future<bufferlist> omap_get_header( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const = 0; + uint64_t limit, + uint32_t op_flags = 0) const = 0; virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0; @@ -153,7 +161,8 @@ public: CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) = 0; + uint64_t len, + uint32_t op_flags = 0) = 0; virtual unsigned get_max_attr_name_length() const = 0; }; diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc index 341c5c5524a..64e6749562e 100644 --- a/src/crimson/os/seastore/async_cleaner.cc +++ b/src/crimson/os/seastore/async_cleaner.cc @@ -609,6 +609,7 @@ JournalTrimmerImpl::trim_alloc() return extent_callback->with_transaction_intr( Transaction::src_t::TRIM_ALLOC, "trim_alloc", + CACHE_HINT_NOCACHE, [this, FNAME](auto &t) { auto target = get_alloc_tail_target(); @@ -653,6 +654,7 @@ JournalTrimmerImpl::trim_dirty() return extent_callback->with_transaction_intr( Transaction::src_t::TRIM_DIRTY, "trim_dirty", + CACHE_HINT_NOCACHE, [this, FNAME](auto &t) { auto target = get_dirty_tail_target(); @@ -1125,6 +1127,7 @@ SegmentCleaner::do_reclaim_space( return extent_callback->with_transaction_intr( src, "clean_reclaim_space", + CACHE_HINT_NOCACHE, [this, &backref_extents, &pin_list, &reclaimed](auto &t) { return seastar::do_with( @@ -1240,6 +1243,7 @@ SegmentCleaner::clean_space_ret SegmentCleaner::clean_space() return extent_callback->with_transaction_intr( Transaction::src_t::READ, "retrieve_from_backref_tree", + CACHE_HINT_NOCACHE, [this, &weak_read_ret](auto &t) { return backref_manager.get_mappings( t, @@ -1506,6 +1510,7 @@ bool SegmentCleaner::check_usage() SpaceTrackerIRef tracker(space_tracker->make_empty()); extent_callback->with_transaction_weak( "check_usage", + CACHE_HINT_NOCACHE, [this, &tracker](auto &t) { return backref_manager.scan_mapped_space( t, @@ -1812,6 +1817,7 @@ bool RBMCleaner::check_usage() RBMSpaceTracker tracker(rbms); extent_callback->with_transaction_weak( "check_usage", + CACHE_HINT_NOCACHE, [this, &tracker, &rbms](auto &t) { return backref_manager.scan_mapped_space( t, diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h index 424247c5bdc..01ab44c4c7c 100644 --- a/src/crimson/os/seastore/async_cleaner.h +++ b/src/crimson/os/seastore/async_cleaner.h @@ -299,24 +299,29 @@ public: /// Creates empty transaction /// weak transaction should be type READ virtual TransactionRef create_transaction( - Transaction::src_t, const char *name, bool is_weak=false) = 0; + Transaction::src_t, + const char *name, + cache_hint_t cache_hint = CACHE_HINT_TOUCH, + bool is_weak=false) = 0; /// Creates empty transaction with interruptible context template <typename Func> auto with_transaction_intr( Transaction::src_t src, const char* name, + cache_hint_t cache_hint, Func &&f) { return do_with_transaction_intr<Func, false>( - src, name, std::forward<Func>(f)); + src, name, cache_hint, std::forward<Func>(f)); } template <typename Func> auto with_transaction_weak( const char* name, + cache_hint_t cache_hint, Func &&f) { return do_with_transaction_intr<Func, true>( - Transaction::src_t::READ, name, std::forward<Func>(f) + Transaction::src_t::READ, name, cache_hint, std::forward<Func>(f) ).handle_error( crimson::ct_error::eagain::assert_failure{"unexpected eagain"}, crimson::ct_error::pass_further_all{} @@ -385,9 +390,10 @@ private: auto do_with_transaction_intr( Transaction::src_t src, const char* name, + cache_hint_t cache_hint, Func &&f) { return seastar::do_with( - create_transaction(src, name, IsWeak), + create_transaction(src, name, cache_hint, IsWeak), [f=std::forward<Func>(f)](auto &ref_t) mutable { return with_trans_intr( *ref_t, diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 5898b9bad0a..86f816e1648 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -1483,7 +1483,7 @@ record_t Cache::prepare_record( i->state = CachedExtent::extent_state_t::CLEAN; assert(i->is_logical()); i->clear_modified_region(); - touch_extent(*i, &trans_src); + touch_extent(*i, &trans_src, t.get_cache_hint()); DEBUGT("inplace rewrite ool block is commmitted -- {}", t, *i); } @@ -1513,7 +1513,7 @@ record_t Cache::prepare_record( if (i->is_dirty()) { add_to_dirty(i, &t_src); } else { - touch_extent(*i, &t_src); + touch_extent(*i, &t_src, t.get_cache_hint()); } alloc_delta.alloc_blk_ranges.emplace_back( @@ -1759,7 +1759,7 @@ void Cache::complete_commit( add_extent(i); assert(!i->is_dirty()); const auto t_src = t.get_src(); - touch_extent(*i, &t_src); + touch_extent(*i, &t_src, t.get_cache_hint()); epm.commit_space_used(i->get_paddr(), i->get_length()); // Note: commit extents and backref allocations in the same place @@ -2026,7 +2026,7 @@ Cache::replay_delta( [](CachedExtent &) {}, [this](CachedExtent &ext) { // replay is not included by the cache hit metrics - touch_extent(ext, nullptr); + touch_extent(ext, nullptr, CACHE_HINT_TOUCH); }, nullptr) : _get_extent_if_cached( diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index b2248ff37dd..a239b861726 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -124,6 +124,7 @@ public: TransactionRef create_transaction( Transaction::src_t src, const char* name, + cache_hint_t cache_hint, bool is_weak) { LOG_PREFIX(Cache::create_transaction); @@ -137,7 +138,8 @@ public: [this](Transaction& t) { return on_transaction_destruct(t); }, - ++next_id + ++next_id, + cache_hint ); SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}", *ret, name, src, is_weak); @@ -284,7 +286,7 @@ public: SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}", t, type, offset, *ret); t.add_to_read_set(ret); - touch_extent(*ret, &t_src); + touch_extent(*ret, &t_src, t.get_cache_hint()); return ret->wait_io().then([ret] { return get_extent_if_cached_iertr::make_ready_future< CachedExtentRef>(ret); @@ -341,7 +343,7 @@ public: t, T::TYPE, offset, length); auto f = [&t, this, t_src](CachedExtent &ext) { t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent<T>( @@ -389,7 +391,7 @@ public: ++stats.access.s.load_absent; t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent<T>( @@ -487,7 +489,7 @@ public: ++access_stats.cache_lru; ++stats.access.s.cache_lru; } - touch_extent(*p_extent, &t_src); + touch_extent(*p_extent, &t_src, t.get_cache_hint()); } else { if (p_extent->is_dirty()) { ++access_stats.trans_dirty; @@ -834,7 +836,7 @@ private: t, type, offset, length, laddr); auto f = [&t, this, t_src](CachedExtent &ext) { t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent_by_type( @@ -876,7 +878,7 @@ private: ++stats.access.s.load_absent; t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent_by_type( @@ -1472,11 +1474,10 @@ private: /// Update lru for access to ref void touch_extent( CachedExtent &ext, - const Transaction::src_t* p_src) + const Transaction::src_t* p_src, + cache_hint_t hint) { - if (p_src && - is_background_transaction(*p_src) && - is_logical_type(ext.get_type())) { + if (hint == CACHE_HINT_NOCACHE && is_logical_type(ext.get_type())) { return; } if (ext.is_stable_clean() && !ext.is_placeholder()) { diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index 5b51083f344..6a866cb1f9b 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -408,6 +408,7 @@ SeaStore::Shard::mkfs_managers() return transaction_manager->with_transaction_intr( Transaction::src_t::MUTATE, "mkfs_seastore", + CACHE_HINT_TOUCH, [this](auto& t) { LOG_PREFIX(SeaStoreS::mkfs_managers); @@ -897,9 +898,10 @@ get_ranges(CollectionRef ch, seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> SeaStore::Shard::list_objects(CollectionRef ch, - const ghobject_t& start, - const ghobject_t& end, - uint64_t limit) const + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit, + uint32_t op_flags) const { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -910,13 +912,14 @@ SeaStore::Shard::list_objects(CollectionRef ch, return seastar::do_with( RetType(std::vector<ghobject_t>(), start), std::move(limit), - [this, ch, start, end](auto& ret, auto& limit) { - return repeat_eagain([this, ch, start, end, &limit, &ret] { + [this, ch, start, end, op_flags](auto& ret, auto& limit) { + return repeat_eagain([this, ch, start, end, &limit, &ret, op_flags] { ++(shard_stats.repeat_read_num); return transaction_manager->with_transaction_intr( Transaction::src_t::READ, "list_objects", + op_flags, [this, ch, start, end, &limit, &ret](auto &t) { LOG_PREFIX(SeaStoreS::list_objects); @@ -1054,6 +1057,7 @@ SeaStore::Shard::list_collections() return transaction_manager->with_transaction_intr( Transaction::src_t::READ, "list_collections", + CACHE_HINT_TOUCH, [this, &ret](auto& t) { LOG_PREFIX(SeaStoreS::list_collections); @@ -1137,6 +1141,7 @@ SeaStore::Shard::read( Transaction::src_t::READ, "read", op_type_t::READ, + op_flags, [this, offset, len, op_flags](auto &t, auto &onode) { return _read(t, onode, offset, len, op_flags); }).finally([this] { @@ -1148,7 +1153,8 @@ SeaStore::Shard::read( SeaStore::Shard::base_errorator::future<bool> SeaStore::Shard::exists( CollectionRef c, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { LOG_PREFIX(SeaStoreS::exists); ++(shard_stats.read_num); @@ -1160,6 +1166,7 @@ SeaStore::Shard::exists( Transaction::src_t::READ, "exists", op_type_t::READ, + op_flags, [FNAME](auto& t, auto&) { DEBUGT("exists", t); return seastar::make_ready_future<bool>(true); @@ -1240,7 +1247,8 @@ SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist> SeaStore::Shard::get_attr( CollectionRef ch, const ghobject_t& oid, - std::string_view name) const + std::string_view name, + uint32_t op_flags) const { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1251,6 +1259,7 @@ SeaStore::Shard::get_attr( Transaction::src_t::READ, "get_attr", op_type_t::GET_ATTR, + op_flags, [this, name](auto &t, auto& onode) { return _get_attr(t, onode, name); }).handle_error( @@ -1296,7 +1305,8 @@ SeaStore::Shard::_get_attrs( SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t> SeaStore::Shard::get_attrs( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1307,6 +1317,7 @@ SeaStore::Shard::get_attrs( Transaction::src_t::READ, "get_attrs", op_type_t::GET_ATTRS, + op_flags, [this](auto &t, auto& onode) { return _get_attrs(t, onode); }).handle_error( @@ -1338,7 +1349,8 @@ seastar::future<struct stat> SeaStore::Shard::_stat( seastar::future<struct stat> SeaStore::Shard::stat( CollectionRef c, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1349,6 +1361,7 @@ seastar::future<struct stat> SeaStore::Shard::stat( Transaction::src_t::READ, "stat", op_type_t::STAT, + op_flags, [this, oid](auto &t, auto &onode) { return _stat(t, onode, oid); }).handle_error( @@ -1364,9 +1377,10 @@ seastar::future<struct stat> SeaStore::Shard::stat( SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist> SeaStore::Shard::omap_get_header( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { - return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY); + return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY, op_flags); } SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t> @@ -1389,7 +1403,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t> SeaStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const omap_keys_t &keys) + const omap_keys_t &keys, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1400,6 +1415,7 @@ SeaStore::Shard::omap_get_values( Transaction::src_t::READ, "omap_get_values", op_type_t::OMAP_GET_VALUES, + op_flags, [this, keys](auto &t, auto &onode) { return do_omap_get_values(t, onode, keys); }).finally([this] { @@ -1529,7 +1545,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_paged_t> SeaStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const std::optional<std::string> &start) + const std::optional<std::string> &start, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1540,6 +1557,7 @@ SeaStore::Shard::omap_get_values( Transaction::src_t::READ, "omap_get_values2", op_type_t::OMAP_GET_VALUES2, + op_flags, [this, start](auto &t, auto &onode) { return do_omap_get_values(t, onode, start); }).finally([this] { @@ -1589,7 +1607,8 @@ SeaStore::Shard::fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1600,6 +1619,7 @@ SeaStore::Shard::fiemap( Transaction::src_t::READ, "fiemap", op_type_t::READ, + op_flags, [this, off, len](auto &t, auto &onode) { return _fiemap(t, onode, off, len); }).finally([this] { @@ -2677,6 +2697,7 @@ seastar::future<> SeaStore::Shard::write_meta( return transaction_manager->with_transaction_intr( Transaction::src_t::MUTATE, "write_meta", + CACHE_HINT_NOCACHE, [this, &key, &value](auto& t) { LOG_PREFIX(SeaStoreS::write_meta); diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index fd7e177da63..e2a993b9e20 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -101,7 +101,8 @@ public: seastar::future<struct stat> stat( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<ceph::bufferlist> read( CollectionRef c, @@ -118,32 +119,38 @@ public: base_errorator::future<bool> exists( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; get_attr_errorator::future<ceph::bufferlist> get_attr( CollectionRef c, const ghobject_t& oid, - std::string_view name) const final; + std::string_view name, + uint32_t op_flags = 0) const final; get_attrs_ertr::future<attrs_t> get_attrs( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) final; + const omap_keys_t& keys, + uint32_t op_flags = 0) final; /// Retrieves paged set of values > start (if present) read_errorator::future<omap_values_paged_t> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) final; ///< @return <done, values> values.empty() iff done get_attr_errorator::future<bufferlist> omap_get_header( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; /// std::get<1>(ret) returns end if and only if the listing has listed all /// the items within the range, otherwise it returns the next key to be listed. @@ -151,7 +158,8 @@ public: CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const final; + uint64_t limit, + uint32_t op_flags = 0) const final; seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; seastar::future<CollectionRef> open_collection(const coll_t& cid) final; @@ -170,7 +178,8 @@ public: CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) final; + uint64_t len, + uint32_t op_flags = 0) final; unsigned get_max_attr_name_length() const final { return 256; @@ -251,7 +260,8 @@ public: return seastar::do_with( internal_context_t( ch, std::move(t), - transaction_manager->create_transaction(src, tname)), + transaction_manager->create_transaction( + src, tname, t.get_fadvise_flags())), std::forward<F>(f), [this, op_type](auto &ctx, auto &f) { assert(shard_stats.starting_io_num); @@ -298,20 +308,22 @@ public: Transaction::src_t src, const char* tname, op_type_t op_type, + cache_hint_t cache_hint_flags, F &&f) const { auto begin_time = std::chrono::steady_clock::now(); return seastar::do_with( oid, Ret{}, std::forward<F>(f), - [this, ch, src, op_type, begin_time, tname + [this, ch, src, op_type, begin_time, tname, cache_hint_flags ](auto &oid, auto &ret, auto &f) { - return repeat_eagain([&, this, ch, src, tname] { + return repeat_eagain([&, this, ch, src, tname, cache_hint_flags] { assert(src == Transaction::src_t::READ); ++(shard_stats.repeat_read_num); return transaction_manager->with_transaction_intr( src, tname, + cache_hint_flags, [&, this, ch, tname](auto& t) { LOG_PREFIX(SeaStoreS::repeat_with_onode); diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 335a439dcb5..5930469ca07 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -20,9 +20,42 @@ #include "include/intarith.h" #include "include/interval_set.h" #include "include/uuid.h" +#include "include/rados.h" namespace crimson::os::seastore { +class cache_hint_t { + enum hint_t { + TOUCH, + NOCACHE + }; +public: + static constexpr cache_hint_t get_touch() { + return hint_t::TOUCH; + } + static constexpr cache_hint_t get_nocache() { + return hint_t::NOCACHE; + } + cache_hint_t(uint32_t flags) { + if (unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) || + unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) { + hint = NOCACHE; + } + } + bool operator==(const cache_hint_t &other) const { + return hint == other.hint; + } + bool operator!=(const cache_hint_t &other) const { + return hint != other.hint; + } +private: + constexpr cache_hint_t(hint_t hint) : hint(hint) {} + hint_t hint = hint_t::TOUCH; +}; + +inline constexpr cache_hint_t CACHE_HINT_TOUCH = cache_hint_t::get_touch(); +inline constexpr cache_hint_t CACHE_HINT_NOCACHE = cache_hint_t::get_nocache(); + /* using a special xattr key "omap_header" to store omap header */ const std::string OMAP_HEADER_XATTR_KEY = "omap_header"; diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 66a9f896520..cd8c333c69f 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -409,12 +409,14 @@ public: src_t src, journal_seq_t initiated_after, on_destruct_func_t&& f, - transaction_id_t trans_id + transaction_id_t trans_id, + cache_hint_t cache_hint ) : weak(weak), handle(std::move(handle)), on_destruct(std::move(f)), src(src), - trans_id(trans_id) + trans_id(trans_id), + cache_hint(cache_hint) {} void invalidate_clear_write_set() { @@ -573,6 +575,10 @@ public: return pre_alloc_list; } + cache_hint_t get_cache_hint() const { + return cache_hint; + } + private: friend class Cache; friend Ref make_test_transaction(); @@ -682,6 +688,8 @@ private: seastar::lw_shared_ptr<rbm_pending_ool_t> pending_ool; backref_entry_refs_t backref_entries; + + cache_hint_t cache_hint = CACHE_HINT_TOUCH; }; using TransactionRef = Transaction::Ref; @@ -694,7 +702,8 @@ inline TransactionRef make_test_transaction() { Transaction::src_t::MUTATE, JOURNAL_SEQ_NULL, [](Transaction&) {}, - ++next_id + ++next_id, + CACHE_HINT_TOUCH ); } diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 753bd5d6ff6..807d88b2cbc 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -66,6 +66,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() return with_transaction_intr( Transaction::src_t::MUTATE, "mkfs_tm", + CACHE_HINT_TOUCH, [this, FNAME](auto& t) { cache->init(); @@ -131,6 +132,7 @@ TransactionManager::mount() journal->get_trimmer().set_journal_head(start_seq); return with_transaction_weak( "mount", + CACHE_HINT_TOUCH, [this](auto &t) { return cache->init_cached_extents(t, [this](auto &t, auto &e) { diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index dc6cc20cf59..e574460894a 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -741,8 +741,9 @@ public: TransactionRef create_transaction( Transaction::src_t src, const char* name, + cache_hint_t cache_hint = CACHE_HINT_TOUCH, bool is_weak=false) final { - return cache->create_transaction(src, name, is_weak); + return cache->create_transaction(src, name, cache_hint, is_weak); } using ExtentCallbackInterface::submit_transaction_direct_ret; diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h index 64544d4c870..ce649303d4f 100644 --- a/src/crimson/osd/backfill_facades.h +++ b/src/crimson/osd/backfill_facades.h @@ -82,6 +82,9 @@ struct PGFacade final : BackfillState::PGFacade { } PGFacade(PG& pg) : pg(pg) {} + std::ostream &print(std::ostream &out) const override { + return out << pg; + } }; } // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 837fd2eb2af..f957f072c93 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -8,11 +8,7 @@ #include "crimson/osd/backfill_state.h" #include "osd/osd_types_fmt.h" -namespace { - seastar::logger& logger() { - return crimson::get_logger(ceph_subsys_osd); - } -} +SET_SUBSYS(osd); namespace crimson::osd { @@ -27,22 +23,23 @@ BackfillState::BackfillState( progress_tracker( std::make_unique<BackfillState::ProgressTracker>(backfill_machine)) { - logger().debug("{}:{}", __func__, __LINE__); + LOG_PREFIX(BackfillState::BackfillState); + DEBUGDPP("", *backfill_machine.pg); backfill_machine.initiate(); } template <class S> BackfillState::StateHelper<S>::StateHelper() { - logger().debug("enter {}", - boost::typeindex::type_id<S>().pretty_name()); + LOG_PREFIX(BackfillState::StateHelper); + DEBUGDPP("enter {}", pg(), boost::typeindex::type_id<S>().pretty_name()); } template <class S> BackfillState::StateHelper<S>::~StateHelper() { - logger().debug("exit {}", - boost::typeindex::type_id<S>().pretty_name()); + LOG_PREFIX(BackfillState::StateHelper); + DEBUG("exit {}", boost::typeindex::type_id<S>().pretty_name()); } BackfillState::~BackfillState() = default; @@ -63,13 +60,16 @@ BackfillState::BackfillMachine::~BackfillMachine() = default; BackfillState::Initial::Initial(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::Initial::Initial); backfill_state().last_backfill_started = peering_state().earliest_backfill(); - logger().debug("{}: bft={} from {}", - __func__, peering_state().get_backfill_targets(), - backfill_state().last_backfill_started); + DEBUGDPP("{}: bft={} from {}", + pg(), + __func__, + peering_state().get_backfill_targets(), + backfill_state().last_backfill_started); for (const auto& bt : peering_state().get_backfill_targets()) { - logger().debug("{}: target shard {} from {}", - __func__, bt, peering_state().get_peer_last_backfill(bt)); + DEBUGDPP("{}: target shard {} from {}", + pg(), __func__, bt, peering_state().get_peer_last_backfill(bt)); } ceph_assert(peering_state().get_backfill_targets().size()); ceph_assert(!backfill_state().last_backfill_started.is_max()); @@ -80,7 +80,8 @@ BackfillState::Initial::Initial(my_context ctx) boost::statechart::result BackfillState::Initial::react(const BackfillState::Triggered& evt) { - logger().debug("{}: backfill triggered", __func__); + LOG_PREFIX(BackfillState::Initial::react::Triggered); + DEBUGDPP("", pg()); ceph_assert(backfill_state().last_backfill_started == \ peering_state().earliest_backfill()); ceph_assert(peering_state().is_backfilling()); @@ -93,26 +94,10 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt) if (Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, backfill_state().peer_backfill_info)) { - logger().debug("{}: switching to Done state", __func__); + DEBUGDPP("switching to Done state", pg()); return transit<BackfillState::Done>(); } else { - logger().debug("{}: switching to Enqueuing state", __func__); - return transit<BackfillState::Enqueuing>(); - } -} - -boost::statechart::result -BackfillState::Cancelled::react(const BackfillState::Triggered& evt) -{ - logger().debug("{}: backfill re-triggered", __func__); - ceph_assert(peering_state().is_backfilling()); - if (Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - logger().debug("{}: switching to Done state", __func__); - return transit<BackfillState::Done>(); - } else { - logger().debug("{}: switching to Enqueuing state", __func__); + DEBUGDPP("switching to Enqueuing state", pg()); return transit<BackfillState::Enqueuing>(); } } @@ -120,9 +105,10 @@ BackfillState::Cancelled::react(const BackfillState::Triggered& evt) // -- Enqueuing void BackfillState::Enqueuing::maybe_update_range() { + LOG_PREFIX(BackfillState::Enqueuing::maybe_update_range); if (auto& primary_bi = backfill_state().backfill_info; primary_bi.version >= pg().get_projected_last_update()) { - logger().info("{}: bi is current", __func__); + INFODPP("bi is current", pg()); ceph_assert(primary_bi.version == pg().get_projected_last_update()); } else if (primary_bi.version >= peering_state().get_log_tail()) { if (peering_state().get_pg_log().get_log().empty() && @@ -136,31 +122,31 @@ void BackfillState::Enqueuing::maybe_update_range() ceph_assert(primary_bi.version == eversion_t()); return; } - logger().debug("{}: bi is old, ({}) can be updated with log to {}", - __func__, - primary_bi.version, - pg().get_projected_last_update()); + DEBUGDPP("{}: bi is old, ({}) can be updated with log to {}", + pg(), + primary_bi.version, + pg().get_projected_last_update()); auto func = [&](const pg_log_entry_t& e) { - logger().debug("maybe_update_range(lambda): updating from version {}", - e.version); + DEBUGDPP("maybe_update_range(lambda): updating from version {}", + pg(), e.version); if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) { if (e.is_update()) { - logger().debug("maybe_update_range(lambda): {} updated to ver {}", - e.soid, e.version); + DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}", + pg(), e.soid, e.version); primary_bi.objects.erase(e.soid); primary_bi.objects.insert(std::make_pair(e.soid, e.version)); } else if (e.is_delete()) { - logger().debug("maybe_update_range(lambda): {} removed", - e.soid); + DEBUGDPP("maybe_update_range(lambda): {} removed", + pg(), e.soid); primary_bi.objects.erase(e.soid); } } }; - logger().debug("{}: scanning pg log first", __func__); + DEBUGDPP("{}: scanning pg log first", pg()); peering_state().scan_log_after(primary_bi.version, func); - logger().debug("{}: scanning projected log", __func__); + DEBUGDPP("{}: scanning projected log", pg()); pg().get_projected_log().scan_log_after(primary_bi.version, func); primary_bi.version = pg().get_projected_last_update(); } else { @@ -244,6 +230,7 @@ void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( BackfillState::Enqueuing::result_t BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) { + LOG_PREFIX(BackfillState::Enqueuing::remove_on_peers); // set `new_last_backfill_started` to `check` result_t result { {}, check }; for (const auto& bt : peering_state().get_backfill_targets()) { @@ -255,8 +242,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) backfill_listener().enqueue_drop(bt, pbi.begin, version); } } - logger().debug("{}: BACKFILL removing {} from peers {}", - __func__, check, result.pbi_targets); + DEBUGDPP("BACKFILL removing {} from peers {}", + pg(), check, result.pbi_targets); ceph_assert(!result.pbi_targets.empty()); return result; } @@ -264,7 +251,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) BackfillState::Enqueuing::result_t BackfillState::Enqueuing::update_on_peers(const hobject_t& check) { - logger().debug("{}: check={}", __func__, check); + LOG_PREFIX(BackfillState::Enqueuing::update_on_peers); + DEBUGDPP("check={}", pg(), check); const auto& primary_bi = backfill_state().backfill_info; result_t result { {}, primary_bi.begin }; std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills; @@ -325,6 +313,7 @@ bool BackfillState::Enqueuing::Enqueuing::all_emptied( BackfillState::Enqueuing::Enqueuing(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::Enqueuing::Enqueuing); auto& primary_bi = backfill_state().backfill_info; // update our local interval to cope with recent changes @@ -334,8 +323,7 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) // that backfill will be spinning here over and over. For the sake // of performance and complexity we don't synchronize with entire PG. // similar can happen in classical OSD. - logger().warn("{}: bi is old, rescanning of local backfill_info", - __func__); + WARNDPP("bi is old, rescanning of local backfill_info", pg()); post_event(RequestPrimaryScanning{}); return; } else { @@ -347,13 +335,14 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) primary_bi)) { // need to grab one another chunk of the object namespace and restart // the queueing. - logger().debug("{}: reached end for current local chunk", __func__); + DEBUGDPP("reached end for current local chunk", pg()); post_event(RequestPrimaryScanning{}); return; } do { if (!backfill_listener().budget_available()) { + DEBUGDPP("throttle failed, turning to Waiting", pg()); post_event(RequestWaiting{}); return; } else if (should_rescan_replicas(backfill_state().peer_backfill_info, @@ -392,16 +381,25 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) } } while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)); - if (backfill_state().progress_tracker->tracked_objects_completed() - && Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - backfill_state().last_backfill_started = hobject_t::get_max(); - backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. + DEBUGDPP("reached end for current local chunk", pg()); + post_event(RequestPrimaryScanning{}); + return; + } else { + if (backfill_state().progress_tracker->tracked_objects_completed() + && Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + } + DEBUGDPP("reached end for both local and all peers " + "but still has in-flight operations", pg()); + post_event(RequestWaiting{}); } - logger().debug("{}: reached end for both local and all peers " - "but still has in-flight operations", __func__); - post_event(RequestWaiting{}); } // -- PrimaryScanning @@ -416,16 +414,45 @@ BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx) boost::statechart::result BackfillState::PrimaryScanning::react(PrimaryScanned evt) { - logger().debug("{}", __func__); + LOG_PREFIX(BackfillState::PrimaryScanning::react::PrimaryScanned); + DEBUGDPP("", pg()); backfill_state().backfill_info = std::move(evt.result); - return transit<Enqueuing>(); + if (!backfill_state().is_suspended()) { + return transit<Enqueuing>(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(CancelBackfill evt) +{ + LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill); + DEBUGDPP("suspended within PrimaryScanning", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::PrimaryScanning::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit<Enqueuing>(); + } + return discard_event(); } boost::statechart::result BackfillState::PrimaryScanning::react(ObjectPushed evt) { - logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::PrimaryScanning::react::ObjectPushed); + DEBUGDPP("PrimaryScanning::react() on ObjectPushed; evt.object={}", + pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -443,11 +470,11 @@ bool BackfillState::ReplicasScanning::replica_needs_scan( BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::ReplicasScanning::ReplicasScanning); for (const auto& bt : peering_state().get_backfill_targets()) { if (const auto& pbi = backfill_state().peer_backfill_info.at(bt); replica_needs_scan(pbi, backfill_state().backfill_info)) { - logger().debug("{}: scanning peer osd.{} from {}", - __func__, bt, pbi.end); + DEBUGDPP("scanning peer osd.{} from {}", pg(), bt, pbi.end); backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{}); ceph_assert(waiting_on_backfill.find(bt) == \ @@ -469,8 +496,9 @@ BackfillState::ReplicasScanning::~ReplicasScanning() boost::statechart::result BackfillState::ReplicasScanning::react(ReplicaScanned evt) { - logger().debug("{}: got scan result from osd={}, result={}", - __func__, evt.from, evt.result); + LOG_PREFIX(BackfillState::ReplicasScanning::react::ReplicaScanned); + DEBUGDPP("got scan result from osd={}, result={}", + pg(), evt.from, evt.result); // TODO: maybe we'll be able to move waiting_on_backfill from // the machine to the state. ceph_assert(peering_state().is_backfill_target(evt.from)); @@ -479,12 +507,17 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) if (waiting_on_backfill.empty()) { ceph_assert(backfill_state().peer_backfill_info.size() == \ peering_state().get_backfill_targets().size()); - return transit<Enqueuing>(); + if (!backfill_state().is_suspended()) { + return transit<Enqueuing>(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } } } else { - // we canceled backfill for a while due to a too full, and this + // we suspended backfill for a while due to a too full, and this // is an extra response from a non-too-full peer - logger().debug("{}: canceled backfill (too full?)", __func__); + DEBUGDPP("suspended backfill (too full?)", pg()); } return discard_event(); } @@ -492,17 +525,30 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) boost::statechart::result BackfillState::ReplicasScanning::react(CancelBackfill evt) { - logger().debug("{}: cancelled within ReplicasScanning", - __func__); - waiting_on_backfill.clear(); - return transit<Cancelled>(); + LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill); + DEBUGDPP("suspended within ReplicasScanning", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::ReplicasScanning::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::ReplicasScanning::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit<Enqueuing>(); + } + return discard_event(); } boost::statechart::result BackfillState::ReplicasScanning::react(ObjectPushed evt) { - logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::ReplicasScanning::react::ObjectPushed); + DEBUGDPP("ReplicasScanning::react() on ObjectPushed; evt.object={}", + pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -517,17 +563,45 @@ BackfillState::Waiting::Waiting(my_context ctx) boost::statechart::result BackfillState::Waiting::react(ObjectPushed evt) { - logger().debug("Waiting::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::Waiting::react::ObjectPushed); + DEBUGDPP("Waiting::react() on ObjectPushed; evt.object={}", pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false); - return transit<Enqueuing>();; + if (!backfill_state().is_suspended()) { + return transit<Enqueuing>(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::Waiting::react(CancelBackfill evt) +{ + LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill); + DEBUGDPP("suspended within Waiting", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::Waiting::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::Waiting::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit<Enqueuing>(); + } + return discard_event(); } // -- Done BackfillState::Done::Done(my_context ctx) : my_base(ctx) { - logger().info("{}: backfill is done", __func__); + LOG_PREFIX(BackfillState::Done::Done); + INFODPP("backfill is done", pg()); backfill_listener().backfilled(); } @@ -537,13 +611,6 @@ BackfillState::Crashed::Crashed() ceph_abort_msg("{}: this should not happen"); } -// -- Cancelled -BackfillState::Cancelled::Cancelled(my_context ctx) - : my_base(ctx) -{ - ceph_assert(peering_state().get_backfill_targets().size()); -} - // ProgressTracker is an intermediary between the BackfillListener and // BackfillMachine + its states. All requests to push or drop an object // are directed through it. The same happens with notifications about @@ -577,8 +644,8 @@ void BackfillState::ProgressTracker::complete_to( const pg_stat_t& stats, bool may_push_to_max) { - logger().debug("{}: obj={}", - __func__, obj); + LOG_PREFIX(BackfillState::ProgressTracker::complete_to); + DEBUGDPP("obj={}", pg(), obj); if (auto completion_iter = registry.find(obj); completion_iter != std::end(registry)) { completion_iter->second = \ @@ -619,4 +686,19 @@ void BackfillState::enqueue_standalone_push( backfill_machine.backfill_listener.enqueue_push(obj, v, peers); } +void BackfillState::enqueue_standalone_delete( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) +{ + progress_tracker->enqueue_drop(obj); + for (auto bt : peers) { + backfill_machine.backfill_listener.enqueue_drop(bt, obj, v); + } +} + +std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg) { + return pg.print(out); +} + } // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index 072c91e079d..517a02ea4df 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -62,6 +62,8 @@ struct BackfillState { struct CancelBackfill : sc::event<CancelBackfill> { }; + struct ThrottleAcquired : sc::event<ThrottleAcquired> { + }; private: // internal events struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> { @@ -136,34 +138,10 @@ public: explicit Crashed(); }; - struct Cancelled : sc::state<Cancelled, BackfillMachine>, - StateHelper<Cancelled> { - using reactions = boost::mpl::list< - sc::custom_reaction<Triggered>, - sc::custom_reaction<PrimaryScanned>, - sc::custom_reaction<ReplicaScanned>, - sc::custom_reaction<ObjectPushed>, - sc::transition<sc::event_base, Crashed>>; - explicit Cancelled(my_context); - // resume after triggering backfill by on_activate_complete(). - // transit to Enqueuing. - sc::result react(const Triggered&); - sc::result react(const PrimaryScanned&) { - return discard_event(); - } - sc::result react(const ReplicaScanned&) { - return discard_event(); - } - sc::result react(const ObjectPushed&) { - return discard_event(); - } - }; - struct Initial : sc::state<Initial, BackfillMachine>, StateHelper<Initial> { using reactions = boost::mpl::list< sc::custom_reaction<Triggered>, - sc::transition<CancelBackfill, Cancelled>, sc::transition<sc::event_base, Crashed>>; explicit Initial(my_context); // initialize after triggering backfill by on_activate_complete(). @@ -174,12 +152,9 @@ public: struct Enqueuing : sc::state<Enqueuing, BackfillMachine>, StateHelper<Enqueuing> { using reactions = boost::mpl::list< - sc::transition<CancelBackfill, Cancelled>, sc::transition<RequestPrimaryScanning, PrimaryScanning>, sc::transition<RequestReplicasScanning, ReplicasScanning>, sc::transition<RequestWaiting, Waiting>, - sc::transition<RequestDone, Done>, - sc::transition<CancelBackfill, Cancelled>, sc::transition<sc::event_base, Crashed>>; explicit Enqueuing(my_context); @@ -237,12 +212,15 @@ public: sc::custom_reaction<ObjectPushed>, sc::custom_reaction<PrimaryScanned>, sc::transition<RequestDone, Done>, - sc::transition<CancelBackfill, Cancelled>, + sc::custom_reaction<CancelBackfill>, + sc::custom_reaction<Triggered>, sc::transition<sc::event_base, Crashed>>; explicit PrimaryScanning(my_context); sc::result react(ObjectPushed); // collect scanning result and transit to Enqueuing. sc::result react(PrimaryScanned); + sc::result react(CancelBackfill); + sc::result react(Triggered); }; struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>, @@ -251,6 +229,7 @@ public: sc::custom_reaction<ObjectPushed>, sc::custom_reaction<ReplicaScanned>, sc::custom_reaction<CancelBackfill>, + sc::custom_reaction<Triggered>, sc::transition<RequestDone, Done>, sc::transition<sc::event_base, Crashed>>; explicit ReplicasScanning(my_context); @@ -259,6 +238,7 @@ public: sc::result react(ObjectPushed); sc::result react(ReplicaScanned); sc::result react(CancelBackfill); + sc::result react(Triggered); // indicate whether a particular peer should be scanned to retrieve // BackfillInterval for new range of hobject_t namespace. @@ -277,10 +257,14 @@ public: using reactions = boost::mpl::list< sc::custom_reaction<ObjectPushed>, sc::transition<RequestDone, Done>, - sc::transition<CancelBackfill, Cancelled>, + sc::custom_reaction<CancelBackfill>, + sc::custom_reaction<Triggered>, + sc::transition<ThrottleAcquired, Enqueuing>, sc::transition<sc::event_base, Crashed>>; explicit Waiting(my_context); sc::result react(ObjectPushed); + sc::result react(CancelBackfill); + sc::result react(Triggered); }; struct Done : sc::state<Done, BackfillMachine>, @@ -308,6 +292,11 @@ public: const hobject_t &obj, const eversion_t &v, const std::vector<pg_shard_t> &peers); + void enqueue_standalone_delete( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); + bool is_triggered() const { return backfill_machine.triggering_event() != nullptr; @@ -325,6 +314,26 @@ public: } } private: + struct backfill_suspend_state_t { + bool suspended = false; + bool should_go_enqueuing = false; + } backfill_suspend_state; + bool is_suspended() const { + return backfill_suspend_state.suspended; + } + void on_suspended() { + ceph_assert(!is_suspended()); + backfill_suspend_state = {true, false}; + } + bool on_resumed() { + auto go_enqueuing = backfill_suspend_state.should_go_enqueuing; + backfill_suspend_state = {false, false}; + return go_enqueuing; + } + void go_enqueuing_on_resume() { + ceph_assert(is_suspended()); + backfill_suspend_state.should_go_enqueuing = true; + } hobject_t last_backfill_started; BackfillInterval backfill_info; std::map<pg_shard_t, BackfillInterval> peer_backfill_info; @@ -405,8 +414,10 @@ struct BackfillState::PGFacade { virtual const eversion_t& get_projected_last_update() const = 0; virtual const PGLog::IndexedLog& get_projected_log() const = 0; + virtual std::ostream &print(std::ostream &out) const = 0; virtual ~PGFacade() {} }; +std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg); class BackfillState::ProgressTracker { // TODO: apply_stat, @@ -433,6 +444,9 @@ class BackfillState::ProgressTracker { BackfillListener& backfill_listener() { return backfill_machine.backfill_listener; } + PGFacade& pg() { + return *backfill_machine.pg; + } public: ProgressTracker(BackfillMachine& backfill_machine) @@ -447,3 +461,9 @@ public: }; } // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::BackfillState::PGFacade> + : fmt::ostream_formatter {}; +#endif + diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h index f8815a316d1..394375c1129 100644 --- a/src/crimson/osd/osd_operation.h +++ b/src/crimson/osd/osd_operation.h @@ -217,6 +217,9 @@ protected: public: static constexpr bool is_trackable = true; + virtual bool requires_pg() const { + return true; + } }; template <class T> @@ -338,6 +341,18 @@ public: with_throttle_while(std::forward<Args>(args)...), *this); } + // Returns std::nullopt if the throttle is acquired immediately, + // returns the future for the acquiring otherwise + std::optional<seastar::future<>> + try_acquire_throttle_now(crimson::osd::scheduler::params_t params) { + if (!max_in_progress || in_progress < max_in_progress) { + ++in_progress; + --pending; + return std::nullopt; + } + return acquire_throttle(params); + } + private: void dump_detail(Formatter *f) const final; diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index 98443bdfc0f..91a6728fd4b 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -42,6 +42,10 @@ class ClientRequest final : public PhasedOperationT<ClientRequest>, unsigned instance_id = 0; public: + epoch_t get_epoch_sent_at() const { + return m->get_map_epoch(); + } + /** * instance_handle_t * diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h index e12243ce430..fe4761c4ab4 100644 --- a/src/crimson/osd/osd_operations/logmissing_request.h +++ b/src/crimson/osd/osd_operations/logmissing_request.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h index 71651d16789..bdb6c2ac6ac 100644 --- a/src/crimson/osd/osd_operations/logmissing_request_reply.h +++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h index 85de5c711d6..aa6b8a95a94 100644 --- a/src/crimson/osd/osd_operations/peering_event.h +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -44,6 +44,10 @@ protected: float delay = 0; PGPeeringEvent evt; + epoch_t get_epoch_sent_at() const { + return evt.get_epoch_sent(); + } + const pg_shard_t get_from() const { return from; } @@ -84,6 +88,10 @@ public: evt(std::forward<Args>(args)...) {} + bool requires_pg() const final { + return evt.requires_pg; + } + void print(std::ostream &) const final; void dump_detail(ceph::Formatter* f) const final; seastar::future<> with_pg( diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h index 43be7319545..21702f6ff4f 100644 --- a/src/crimson/osd/osd_operations/pg_advance_map.h +++ b/src/crimson/osd/osd_operations/pg_advance_map.h @@ -50,6 +50,10 @@ public: PGPeeringPipeline::Process::BlockingEvent > tracking_events; + epoch_t get_epoch_sent_at() const { + return to; + } + private: PGPeeringPipeline &peering_pp(PG &pg); }; diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h index 17c2faf97ea..2fe8ff372b3 100644 --- a/src/crimson/osd/osd_operations/recovery_subrequest.h +++ b/src/crimson/osd/osd_operations/recovery_subrequest.h @@ -39,6 +39,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return m->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return m->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h index 37d2771a33b..c2494b3715f 100644 --- a/src/crimson/osd/osd_operations/replicated_request.h +++ b/src/crimson/osd/osd_operations/replicated_request.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/scrub_events.h b/src/crimson/osd/osd_operations/scrub_events.h index 02a5d852bb7..8bed90e4c14 100644 --- a/src/crimson/osd/osd_operations/scrub_events.h +++ b/src/crimson/osd/osd_operations/scrub_events.h @@ -27,11 +27,11 @@ class RemoteScrubEventBaseT : public PhasedOperationT<T> { crimson::net::ConnectionRef l_conn; crimson::net::ConnectionXcoreRef r_conn; - epoch_t epoch; spg_t pgid; protected: using interruptor = InterruptibleOperation::interruptor; + epoch_t epoch; template <typename U=void> using ifut = InterruptibleOperation::interruptible_future<U>; @@ -40,7 +40,7 @@ protected: public: RemoteScrubEventBaseT( crimson::net::ConnectionRef conn, epoch_t epoch, spg_t pgid) - : l_conn(std::move(conn)), epoch(epoch), pgid(pgid) {} + : l_conn(std::move(conn)), pgid(pgid), epoch(epoch) {} PGPeeringPipeline &get_peering_pipeline(PG &pg); @@ -117,6 +117,10 @@ public: : RemoteScrubEventBaseT<ScrubRequested>(std::forward<Args>(base_args)...), deep(deep) {} + epoch_t get_epoch_sent_at() const { + return epoch; + } + void print(std::ostream &out) const final { out << "(deep=" << deep << ")"; } @@ -141,6 +145,10 @@ public: ceph_assert(scrub::PGScrubber::is_scrub_message(*m)); } + epoch_t get_epoch_sent_at() const { + return epoch; + } + void print(std::ostream &out) const final { out << "(m=" << *m << ")"; } diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index bf521498abf..2746e730f2b 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -879,6 +879,17 @@ void PG::enqueue_push_for_backfill( backfill_state->enqueue_standalone_push(obj, v, peers); } +void PG::enqueue_delete_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) +{ + assert(recovery_handler); + assert(recovery_handler->backfill_state); + auto backfill_state = recovery_handler->backfill_state.get(); + backfill_state->enqueue_standalone_delete(obj, v, peers); +} + PG::interruptible_future< std::tuple<PG::interruptible_future<>, PG::interruptible_future<>>> diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 6db73ee835b..06038c0aa00 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -904,6 +904,11 @@ private: const hobject_t &obj, const eversion_t &v, const std::vector<pg_shard_t> &peers); + void enqueue_delete_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); + bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const; bool can_discard_op(const MOSDOp& m) const; void context_registry_on_change(); diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc index a40b28caa8b..79895de06de 100644 --- a/src/crimson/osd/pg_backend.cc +++ b/src/crimson/osd/pg_backend.cc @@ -1325,9 +1325,10 @@ maybe_get_omap_vals( PGBackend::ll_read_ierrorator::future<ceph::bufferlist> PGBackend::omap_get_header( const crimson::os::CollectionRef& c, - const ghobject_t& oid) const + const ghobject_t& oid, + uint32_t op_flags) const { - return store->omap_get_header(c, oid) + return store->omap_get_header(c, oid, op_flags) .handle_error( crimson::ct_error::enodata::handle([] { return seastar::make_ready_future<bufferlist>(); @@ -1340,10 +1341,13 @@ PGBackend::ll_read_ierrorator::future<> PGBackend::omap_get_header( const ObjectState& os, OSDOp& osd_op, - object_stat_sum_t& delta_stats) const + object_stat_sum_t& delta_stats, + uint32_t op_flags) const { if (os.oi.is_omap()) { - return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then_interruptible( + return omap_get_header( + coll, ghobject_t{os.oi.soid}, CEPH_OSD_OP_FLAG_FADVISE_DONTNEED + ).safe_then_interruptible( [&delta_stats, &osd_op] (ceph::bufferlist&& header) { osd_op.outdata = std::move(header); delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); @@ -1707,7 +1711,8 @@ PGBackend::fiemap( CollectionRef c, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { return store->fiemap(c, oid, off, len); } diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h index c24176a10e7..9c2230375b0 100644 --- a/src/crimson/osd/pg_backend.h +++ b/src/crimson/osd/pg_backend.h @@ -315,7 +315,8 @@ public: CollectionRef c, const ghobject_t& oid, uint64_t off, - uint64_t len); + uint64_t len, + uint32_t op_flags = 0); write_iertr::future<> tmapput( ObjectState& os, @@ -375,11 +376,13 @@ public: object_stat_sum_t& delta_stats); ll_read_ierrorator::future<ceph::bufferlist> omap_get_header( const crimson::os::CollectionRef& c, - const ghobject_t& oid) const; + const ghobject_t& oid, + uint32_t op_flags = 0) const; ll_read_ierrorator::future<> omap_get_header( const ObjectState& os, OSDOp& osd_op, - object_stat_sum_t& delta_stats) const; + object_stat_sum_t& delta_stats, + uint32_t op_flags = 0) const; interruptible_future<> omap_set_header( ObjectState& os, const OSDOp& osd_op, diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index ec3af0d2b00..5eef584c776 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -67,8 +67,6 @@ PGRecovery::start_recovery_ops( if (max_to_start > 0) { max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started); } - using interruptor = - crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>; return interruptor::parallel_for_each(started, [] (auto&& ifut) { return std::move(ifut); @@ -609,8 +607,21 @@ void PGRecovery::update_peers_last_backfill( bool PGRecovery::budget_available() const { - // TODO: the limits! - return true; + crimson::osd::scheduler::params_t params = + {1, 0, crimson::osd::scheduler::scheduler_class_t::background_best_effort}; + auto &ss = pg->get_shard_services(); + auto futopt = ss.try_acquire_throttle_now(std::move(params)); + if (!futopt) { + return true; + } + std::ignore = interruptor::make_interruptible(std::move(*futopt) + ).then_interruptible([this] { + assert(!backfill_state->is_triggered()); + using BackfillState = crimson::osd::BackfillState; + backfill_state->process_event( + BackfillState::ThrottleAcquired{}.intrusive_from_this()); + }); + return false; } void PGRecovery::on_pg_clean() diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index 657e6d3e888..5c7b5c5ef2b 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -25,6 +25,8 @@ class PGBackend; class PGRecovery : public crimson::osd::BackfillState::BackfillListener { public: + using interruptor = + crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>; template <typename T = void> using interruptible_future = RecoveryBackend::interruptible_future<T>; PGRecovery(PGRecoveryListener* pg) : pg(pg) {} diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h index b9879c8c9dd..f7bd7a6c08e 100644 --- a/src/crimson/osd/pg_shard_manager.h +++ b/src/crimson/osd/pg_shard_manager.h @@ -256,18 +256,40 @@ public: auto &opref = *op; return opref.template with_blocking_event< PGMap::PGCreationBlockingEvent - >([&target_shard_services, &opref](auto &&trigger) { - return target_shard_services.wait_for_pg( - std::move(trigger), opref.get_pgid()); - }).safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) { - logger.debug("{}: have_pg", opref); - return opref.with_pg(target_shard_services, pgref); - }).handle_error( - crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { - logger.debug("{}: pg creation canceled, dropping", opref); - return seastar::now(); - }) - ).then([op=std::move(op)] {}); + >([&target_shard_services, &opref, &logger](auto &&trigger) mutable { + auto pg = target_shard_services.get_pg(opref.get_pgid()); + auto fut = ShardServices::wait_for_pg_ertr::make_ready_future<Ref<PG>>(pg); + if (!pg) { + if (opref.requires_pg()) { + auto osdmap = target_shard_services.get_map(); + if (!osdmap->is_up_acting_osd_shard( + opref.get_pgid(), target_shard_services.local_state.whoami)) { + logger.debug( + "pg {} for {} is no longer here, discarding", + opref.get_pgid(), opref); + opref.get_handle().exit(); + auto _fut = seastar::now(); + if (osdmap->get_epoch() > opref.get_epoch_sent_at()) { + _fut = target_shard_services.send_incremental_map( + std::ref(opref.get_foreign_connection()), + opref.get_epoch_sent_at() + 1); + } + return _fut; + } + } + fut = target_shard_services.wait_for_pg( + std::move(trigger), opref.get_pgid()); + } + return fut.safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) { + logger.debug("{}: have_pg", opref); + return opref.with_pg(target_shard_services, pgref); + }).handle_error( + crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { + logger.debug("{}: pg creation canceled, dropping", opref); + return seastar::now(); + }) + ); + }).then([op=std::move(op)] {}); } seastar::future<> load_pgs(crimson::os::FuturizedStore& store); diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc index f09cd147ea9..6c8abecffaf 100644 --- a/src/crimson/osd/replicated_backend.cc +++ b/src/crimson/osd/replicated_backend.cc @@ -96,11 +96,18 @@ ReplicatedBackend::submit_transaction( bufferlist encoded_txn; encode(txn, encoded_txn); + bool is_delete = false; for (auto &le : log_entries) { le.mark_unrollbackable(); + if (le.is_delete()) { + is_delete = true; + } } + co_await pg.update_snap_map(log_entries, txn); + std::vector<pg_shard_t> to_push_clone; + std::vector<pg_shard_t> to_push_delete; auto sends = std::make_unique<std::vector<seastar::future<>>>(); for (auto &pg_shard : pg_shards) { if (pg_shard == whoami) { @@ -115,12 +122,17 @@ ReplicatedBackend::submit_transaction( m = new_repop_msg( pg_shard, hoid, encoded_txn, osd_op_p, min_epoch, map_epoch, log_entries, false, tid); - if (_new_clone && pg.is_missing_on_peer(pg_shard, hoid)) { - // The head is in the push queue but hasn't been pushed yet. - // We need to ensure that the newly created clone will be - // pushed as well, otherwise we might skip it. - // See: https://tracker.ceph.com/issues/68808 - to_push_clone.push_back(pg_shard); + if (pg.is_missing_on_peer(pg_shard, hoid)) { + if (_new_clone) { + // The head is in the push queue but hasn't been pushed yet. + // We need to ensure that the newly created clone will be + // pushed as well, otherwise we might skip it. + // See: https://tracker.ceph.com/issues/68808 + to_push_clone.push_back(pg_shard); + } + if (is_delete) { + to_push_delete.push_back(pg_shard); + } } } pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); @@ -130,8 +142,6 @@ ReplicatedBackend::submit_transaction( pg_shard.osd, std::move(m), map_epoch)); } - co_await pg.update_snap_map(log_entries, txn); - pg.log_operation( std::move(log_entries), osd_op_p.pg_trim_to, @@ -157,7 +167,8 @@ ReplicatedBackend::submit_transaction( return seastar::now(); } return peers->all_committed.get_shared_future(); - }).then_interruptible([pending_txn, this, _new_clone, + }).then_interruptible([pending_txn, this, _new_clone, &hoid, + to_push_delete=std::move(to_push_delete), to_push_clone=std::move(to_push_clone)] { auto acked_peers = std::move(pending_txn->second.acked_peers); pending_trans.erase(pending_txn); @@ -167,6 +178,9 @@ ReplicatedBackend::submit_transaction( _new_clone->obs.oi.version, to_push_clone); } + if (!to_push_delete.empty()) { + pg.enqueue_delete_for_backfill(hoid, {}, to_push_delete); + } return seastar::make_ready_future< crimson::osd::acked_peers_t>(std::move(acked_peers)); }); diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc index 76f24196b51..0d6c9d38236 100644 --- a/src/crimson/osd/replicated_recovery_backend.cc +++ b/src/crimson/osd/replicated_recovery_backend.cc @@ -35,6 +35,15 @@ ReplicatedRecoveryBackend::recover_object( logger().debug("recover_object: loading obc: {}", soid); return pg.obc_loader.with_obc<RWState::RWREAD>(soid, [this, soid, need](auto head, auto obc) { + if (!obc->obs.exists) { + // XXX: this recovery must be triggered by backfills and the corresponding + // object must have been deleted by some client request after the object + // is enqueued for push but before the lock is acquired by the recovery. + // + // Abort the recovery in this case, a "recover_delete" must have been + // added for this object by the client request that deleted it. + return interruptor::now(); + } logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid); auto& recovery_waiter = get_recovering(soid); recovery_waiter.obc = obc; @@ -306,7 +315,10 @@ ReplicatedRecoveryBackend::recover_delete( } return seastar::make_ready_future<>(); }).then_interruptible([this, soid, &stat_diff] { - pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + const auto &missing = pg.get_peering_state().get_pg_log().get_missing(); + if (!missing.is_missing(soid)) { + pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + } return seastar::make_ready_future<>(); }); }); @@ -568,14 +580,17 @@ ReplicatedRecoveryBackend::read_metadata_for_push_op( return seastar::make_ready_future<eversion_t>(ver); } return interruptor::make_interruptible(interruptor::when_all_succeed( - backend->omap_get_header(coll, ghobject_t(oid)).handle_error_interruptible<false>( + backend->omap_get_header( + coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED + ).handle_error_interruptible<false>( crimson::os::FuturizedStore::Shard::read_errorator::all_same_way( [oid] (const std::error_code& e) { logger().debug("read_metadata_for_push_op, error {} when getting omap header: {}", e, oid); return seastar::make_ready_future<bufferlist>(); })), - interruptor::make_interruptible(store->get_attrs(coll, ghobject_t(oid))) - .handle_error_interruptible<false>( + interruptor::make_interruptible( + store->get_attrs(coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) + ).handle_error_interruptible<false>( crimson::os::FuturizedStore::Shard::get_attrs_ertr::all_same_way( [oid] (const std::error_code& e) { logger().debug("read_metadata_for_push_op, error {} when getting attrs: {}", e, oid); @@ -613,8 +628,14 @@ ReplicatedRecoveryBackend::read_object_for_push_op( return seastar::make_ready_future<uint64_t>(offset); } // 1. get the extents in the interested range - return interruptor::make_interruptible(backend->fiemap(coll, ghobject_t{oid}, - 0, copy_subset.range_end())).safe_then_interruptible( + return interruptor::make_interruptible( + backend->fiemap( + coll, + ghobject_t{oid}, + 0, + copy_subset.range_end(), + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) + ).safe_then_interruptible( [=, this](auto&& fiemap_included) mutable { interval_set<uint64_t> extents; try { @@ -630,8 +651,12 @@ ReplicatedRecoveryBackend::read_object_for_push_op( push_op->data_included.span_of(extents, offset, max_len); // 3. read the truncated extents // TODO: check if the returned extents are pruned - return interruptor::make_interruptible(store->readv(coll, ghobject_t{oid}, - push_op->data_included, 0)); + return interruptor::make_interruptible( + store->readv( + coll, + ghobject_t{oid}, + push_op->data_included, + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)); }).safe_then_interruptible([push_op, range_end=copy_subset.range_end()](auto &&bl) { push_op->data.claim_append(std::move(bl)); uint64_t recovered_to = 0; diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index c2340898929..e1acb34636f 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -783,6 +783,11 @@ seastar::future<> ShardServices::dispatch_context_transaction( co_return; } +Ref<PG> ShardServices::get_pg(spg_t pgid) +{ + return local_state.get_pg(pgid); +} + seastar::future<> ShardServices::dispatch_context_messages( BufferedRecoveryMessages &&ctx) { diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h index 56ac4963fff..f1ed9b8d911 100644 --- a/src/crimson/osd/shard_services.h +++ b/src/crimson/osd/shard_services.h @@ -483,6 +483,8 @@ public: return pg_to_shard_mapping.remove_pg_mapping(pgid); } + Ref<PG> get_pg(spg_t pgid); + crimson::common::CephContext *get_cct() { return &(local_state.cct); } @@ -589,6 +591,7 @@ public: FORWARD_TO_OSD_SINGLETON(get_pool_info) FORWARD(with_throttle_while, with_throttle_while, local_state.throttler) + FORWARD(try_acquire_throttle_now, try_acquire_throttle_now, local_state.throttler) FORWARD_TO_OSD_SINGLETON(build_incremental_map_msg) FORWARD_TO_OSD_SINGLETON(send_incremental_map) diff --git a/src/crimson/tools/store_nbd/tm_driver.cc b/src/crimson/tools/store_nbd/tm_driver.cc index 389ecd78afc..870809c5153 100644 --- a/src/crimson/tools/store_nbd/tm_driver.cc +++ b/src/crimson/tools/store_nbd/tm_driver.cc @@ -25,6 +25,7 @@ seastar::future<> TMDriver::write( return tm->with_transaction_intr( Transaction::src_t::MUTATE, "write", + CACHE_HINT_TOUCH, [this, offset, &ptr](auto& t) { return tm->remove(t, laddr_t::from_byte_offset(offset) @@ -112,6 +113,7 @@ seastar::future<bufferlist> TMDriver::read( return tm->with_transaction_intr( Transaction::src_t::READ, "read", + CACHE_HINT_TOUCH, [=, &blret, this](auto& t) { return read_extents(t, laddr_t::from_byte_offset(offset), size diff --git a/src/exporter/ceph_exporter.cc b/src/exporter/ceph_exporter.cc index 44b67c7e615..2232851c094 100644 --- a/src/exporter/ceph_exporter.cc +++ b/src/exporter/ceph_exporter.cc @@ -30,13 +30,13 @@ static void handle_signal(int signum) static void usage() { std::cout << "usage: ceph-exporter [options]\n" << "options:\n" - " --sock-dir: The path to ceph daemons socket files dir\n" - " --addrs: Host ip address where exporter is deployed\n" - " --port: Port to deploy exporter on. Default is 9926\n" - " --cert-file: Path to the certificate file to use https\n" - " --key-file: Path to the certificate key file to use https\n" + " --sock-dir: The path to Ceph daemon sockets (*.asok)\n" + " --addrs: Host IP address on which the exporter is to listen\n" + " --port: TCP Port on which the exporter is to listen. Default is 9926\n" + " --cert-file: Path to the certificate file when using HTTPS\n" + " --key-file: Path to the certificate key file when using HTTPS\n" " --prio-limit: Only perf counters greater than or equal to prio-limit are fetched. Default: 5\n" - " --stats-period: Time to wait before sending requests again to exporter server (seconds). Default: 5s" + " --stats-period: Interval between daemon scrapes (seconds). Default: 5s" << std::endl; generic_server_usage(); } diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp index 4a7ac3ea6e0..0f5a9036eff 100644 --- a/src/include/rados/librados.hpp +++ b/src/include/rados/librados.hpp @@ -202,6 +202,8 @@ inline namespace v14_2_0 { int set_complete_callback(void *cb_arg, callback_t cb); int set_safe_callback(void *cb_arg, callback_t cb) __attribute__ ((deprecated)); + /// Request immediate cancellation as if by IoCtx::aio_cancel(). + int cancel(); int wait_for_complete(); int wait_for_safe() __attribute__ ((deprecated)); int wait_for_complete_and_cb(); @@ -772,17 +774,30 @@ inline namespace v14_2_0 { void tier_evict(); }; - /* IoCtx : This is a context in which we can perform I/O. - * It includes a Pool, + /** + * @brief A handle to a RADOS pool used to perform I/O operations. * * Typical use (error checking omitted): - * + * @code * IoCtx p; * rados.ioctx_create("my_pool", p); - * p->stat(&stats); - * ... etc ... + * p.stat("my_object", &size, &mtime); + * @endcode + * + * IoCtx holds a pointer to its underlying implementation. The dup() + * method performs a deep copy of this implementation, but the copy + * construction and assignment operations perform shallow copies by + * sharing that pointer. + * + * Function names starting with aio_ are asynchronous operations that + * return immediately after submitting a request, and whose completions + * are managed by the given AioCompletion pointer. The IoCtx's underlying + * implementation is involved in the delivery of these completions, so + * the caller must guarantee that its lifetime is preserved until then - + * if not by preserving the IoCtx instance that submitted the request, + * then by a copied/moved instance that shares the same implementation. * - * NOTE: be sure to call watch_flush() prior to destroying any IoCtx + * @note Be sure to call watch_flush() prior to destroying any IoCtx * that is used for watch events to ensure that racing callbacks * have completed. */ @@ -791,9 +806,13 @@ inline namespace v14_2_0 { public: IoCtx(); static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool); + /// Construct a shallow copy of rhs, sharing its underlying implementation. IoCtx(const IoCtx& rhs); + /// Assign a shallow copy of rhs, sharing its underlying implementation. IoCtx& operator=(const IoCtx& rhs); + /// Move construct from rhs, transferring its underlying implementation. IoCtx(IoCtx&& rhs) noexcept; + /// Move assign from rhs, transferring its underlying implementation. IoCtx& operator=(IoCtx&& rhs) noexcept; ~IoCtx(); @@ -1150,7 +1169,8 @@ inline namespace v14_2_0 { int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts); /** - * Cancel aio operation + * Request immediate cancellation with error code -ECANCELED + * if the operation hasn't already completed. * * @param c completion handle * @returns 0 on success, negative error code on failure diff --git a/src/include/random.h b/src/include/random.h index f2e3e37bcd7..6b7c9405efd 100644 --- a/src/include/random.h +++ b/src/include/random.h @@ -16,9 +16,9 @@ #define CEPH_RANDOM_H 1 #include <mutex> +#include <optional> #include <random> #include <type_traits> -#include <boost/optional.hpp> // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85494 #ifdef __MINGW32__ @@ -123,7 +123,7 @@ void randomize_rng() template <typename EngineT> EngineT& engine() { - thread_local boost::optional<EngineT> rng_engine; + thread_local std::optional<EngineT> rng_engine; if (!rng_engine) { rng_engine.emplace(EngineT()); diff --git a/src/kv/KeyValueDB.h b/src/kv/KeyValueDB.h index 858742d511e..d926840180e 100644 --- a/src/kv/KeyValueDB.h +++ b/src/kv/KeyValueDB.h @@ -9,6 +9,7 @@ #include <map> #include <optional> #include <string> +#include <string_view> #include <boost/scoped_ptr.hpp> #include "include/encoding.h" #include "common/Formatter.h" @@ -211,6 +212,10 @@ public: return ""; } virtual ceph::buffer::list value() = 0; + // When valid() returns true, value returned as string-view + // is guaranteed to be valid until iterator is moved to another + // position; that is until call to next() / seek_to_first() / etc. + virtual std::string_view value_as_sv() = 0; virtual int status() = 0; virtual ~SimplestIteratorImpl() {} }; @@ -220,7 +225,12 @@ public: virtual ~IteratorImpl() {} virtual int seek_to_last() = 0; virtual int prev() = 0; + // When valid() returns true, key returned as string-view + // is guaranteed to be valid until iterator is moved to another + // position; that is until call to next() / seek_to_first() / etc. + virtual std::string_view key_as_sv() = 0; virtual std::pair<std::string, std::string> raw_key() = 0; + virtual std::pair<std::string_view, std::string_view> raw_key_as_sv() = 0; virtual ceph::buffer::ptr value_as_ptr() { ceph::buffer::list bl = value(); if (bl.length() == 1) { @@ -247,7 +257,9 @@ public: virtual int next() = 0; virtual int prev() = 0; virtual std::string key() = 0; + virtual std::string_view key_as_sv() = 0; virtual std::pair<std::string,std::string> raw_key() = 0; + virtual std::pair<std::string_view, std::string_view> raw_key_as_sv() = 0; virtual bool raw_key_is_prefixed(const std::string &prefix) = 0; virtual ceph::buffer::list value() = 0; virtual ceph::buffer::ptr value_as_ptr() { @@ -258,6 +270,7 @@ public: return ceph::buffer::ptr(); } } + virtual std::string_view value_as_sv() = 0; virtual int status() = 0; virtual size_t key_size() { return 0; @@ -315,15 +328,24 @@ private: std::string key() override { return generic_iter->key(); } + std::string_view key_as_sv() override { + return generic_iter->key_as_sv(); + } std::pair<std::string, std::string> raw_key() override { return generic_iter->raw_key(); } + std::pair<std::string_view, std::string_view> raw_key_as_sv() override { + return generic_iter->raw_key_as_sv(); + } ceph::buffer::list value() override { return generic_iter->value(); } ceph::buffer::ptr value_as_ptr() override { return generic_iter->value_as_ptr(); } + std::string_view value_as_sv() override { + return generic_iter->value_as_sv(); + } int status() override { return generic_iter->status(); } diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index ca63ea06484..51d224b67c0 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -6,6 +6,7 @@ #include <memory> #include <set> #include <string> +#include <string_view> #include <errno.h> #include <unistd.h> #include <sys/types.h> @@ -47,6 +48,7 @@ using std::ostream; using std::pair; using std::set; using std::string; +using std::string_view; using std::unique_ptr; using std::vector; @@ -1992,7 +1994,7 @@ int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key) // Find separator inside Slice char* separator = (char*) memchr(in.data(), 0, in.size()); - if (separator == NULL) + if (separator == nullptr) return -EINVAL; prefix_len = size_t(separator - in.data()); if (prefix_len >= in.size()) @@ -2006,6 +2008,27 @@ int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key) return 0; } +// TODO: deduplicate the code, preferrably by removing the string variant +int RocksDBStore::split_key(rocksdb::Slice in, string_view *prefix, string_view *key) +{ + size_t prefix_len = 0; + + // Find separator inside Slice + char* separator = (char*) memchr(in.data(), 0, in.size()); + if (separator == nullptr) + return -EINVAL; + prefix_len = size_t(separator - in.data()); + if (prefix_len >= in.size()) + return -EINVAL; + + // Fetch prefix and/or key directly from Slice + if (prefix) + *prefix = string_view(in.data(), prefix_len); + if (key) + *key = string_view(separator + 1, in.size() - prefix_len - 1); + return 0; +} + void RocksDBStore::compact() { dout(2) << __func__ << " starting" << dendl; @@ -2226,7 +2249,13 @@ int RocksDBStore::RocksDBWholeSpaceIteratorImpl::prev() string RocksDBStore::RocksDBWholeSpaceIteratorImpl::key() { string out_key; - split_key(dbiter->key(), 0, &out_key); + split_key(dbiter->key(), nullptr, &out_key); + return out_key; +} +string_view RocksDBStore::RocksDBWholeSpaceIteratorImpl::key_as_sv() +{ + string_view out_key; + split_key(dbiter->key(), nullptr, &out_key); return out_key; } pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key() @@ -2235,6 +2264,12 @@ pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key() split_key(dbiter->key(), &prefix, &key); return make_pair(prefix, key); } +pair<string_view,string_view> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_as_sv() +{ + string_view prefix, key; + split_key(dbiter->key(), &prefix, &key); + return make_pair(prefix, key); +} bool RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_is_prefixed(const string &prefix) { // Look for "prefix\0" right in rocksb::Slice @@ -2267,6 +2302,12 @@ bufferptr RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_as_ptr() return bufferptr(val.data(), val.size()); } +std::string_view RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_as_sv() +{ + rocksdb::Slice val = dbiter->value(); + return std::string_view{val.data(), val.size()}; +} + int RocksDBStore::RocksDBWholeSpaceIteratorImpl::status() { return dbiter->status().ok() ? 0 : -1; @@ -2348,9 +2389,15 @@ public: string key() override { return dbiter->key().ToString(); } + string_view key_as_sv() override { + return dbiter->key().ToStringView(); + } std::pair<std::string, std::string> raw_key() override { return make_pair(prefix, key()); } + std::pair<std::string_view, std::string_view> raw_key_as_sv() override { + return make_pair(prefix, dbiter->key().ToStringView()); + } bufferlist value() override { return to_bufferlist(dbiter->value()); } @@ -2358,6 +2405,10 @@ public: rocksdb::Slice val = dbiter->value(); return bufferptr(val.data(), val.size()); } + std::string_view value_as_sv() override { + rocksdb::Slice val = dbiter->value(); + return std::string_view{val.data(), val.size()}; + } int status() override { return dbiter->status().ok() ? 0 : -1; } @@ -2668,6 +2719,15 @@ public: } } + std::string_view key_as_sv() override + { + if (smaller == on_main) { + return main->key_as_sv(); + } else { + return current_shard->second->key_as_sv(); + } + } + std::pair<std::string,std::string> raw_key() override { if (smaller == on_main) { @@ -2677,6 +2737,15 @@ public: } } + std::pair<std::string_view,std::string_view> raw_key_as_sv() override + { + if (smaller == on_main) { + return main->raw_key_as_sv(); + } else { + return { current_shard->first, current_shard->second->key_as_sv() }; + } + } + bool raw_key_is_prefixed(const std::string &prefix) override { if (smaller == on_main) { @@ -2695,6 +2764,15 @@ public: } } + std::string_view value_as_sv() override + { + if (smaller == on_main) { + return main->value_as_sv(); + } else { + return current_shard->second->value_as_sv(); + } + } + int status() override { //because we already had to inspect key, it must be ok @@ -3017,9 +3095,15 @@ public: string key() override { return iters[0]->key().ToString(); } + string_view key_as_sv() override { + return iters[0]->key().ToStringView(); + } std::pair<std::string, std::string> raw_key() override { return make_pair(prefix, key()); } + std::pair<std::string_view, std::string_view> raw_key_as_sv() override { + return make_pair(prefix, iters[0]->key().ToStringView()); + } bufferlist value() override { return to_bufferlist(iters[0]->value()); } @@ -3027,6 +3111,10 @@ public: rocksdb::Slice val = iters[0]->value(); return bufferptr(val.data(), val.size()); } + std::string_view value_as_sv() override { + rocksdb::Slice val = iters[0]->value(); + return std::string_view{val.data(), val.size()}; + } int status() override { return iters[0]->status().ok() ? 0 : -1; } diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index 477b209854c..50b91be2bf6 100644 --- a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -386,10 +386,13 @@ public: int next() override; int prev() override; std::string key() override; + std::string_view key_as_sv() override; std::pair<std::string,std::string> raw_key() override; + std::pair<std::string_view,std::string_view> raw_key_as_sv() override; bool raw_key_is_prefixed(const std::string &prefix) override; ceph::bufferlist value() override; ceph::bufferptr value_as_ptr() override; + std::string_view value_as_sv() override; int status() override; size_t key_size() override; size_t value_size() override; @@ -419,6 +422,7 @@ public: } static int split_key(rocksdb::Slice in, std::string *prefix, std::string *key); + static int split_key(rocksdb::Slice in, std::string_view *prefix, std::string_view *key); static std::string past_prefix(const std::string &prefix); diff --git a/src/librados/librados_asio.h b/src/librados/librados_asio.h index 0aedc376575..3e5b7c57c6f 100644 --- a/src/librados/librados_asio.h +++ b/src/librados/librados_asio.h @@ -14,6 +14,9 @@ #ifndef LIBRADOS_ASIO_H #define LIBRADOS_ASIO_H +#include <boost/asio/associated_cancellation_slot.hpp> +#include <boost/asio/cancellation_type.hpp> + #include "include/rados/librados.hpp" #include "common/async/completion.h" #include "librados/AioCompletionImpl.h" @@ -74,6 +77,7 @@ struct Invoker<void> { template <typename Result> struct AsyncOp : Invoker<Result> { unique_aio_completion_ptr aio_completion; + boost::asio::cancellation_slot slot; using Signature = typename Invoker<Result>::Signature; using Completion = ceph::async::Completion<Signature, AsyncOp<Result>>; @@ -83,6 +87,7 @@ struct AsyncOp : Invoker<Result> { auto p = std::unique_ptr<Completion>{static_cast<Completion*>(arg)}; // move result out of Completion memory being freed auto op = std::move(p->user_data); + op.slot.clear(); // clear our cancellation handler // access AioCompletionImpl directly to avoid locking const librados::AioCompletionImpl* pc = op.aio_completion->pc; const int ret = pc->rval; @@ -94,11 +99,46 @@ struct AsyncOp : Invoker<Result> { op.dispatch(std::move(p), ec, ver); } + struct op_cancellation { + AioCompletion* completion = nullptr; + bool is_read = false; + + void operator()(boost::asio::cancellation_type type) { + if (completion == nullptr) { + return; // no AioCompletion attached + } else if (type == boost::asio::cancellation_type::none) { + return; // no cancellation requested + } else if (is_read) { + // read operations produce no side effects, so can satisfy the + // requirements of 'total' cancellation. the weaker requirements + // of 'partial' and 'terminal' are also satisfied + completion->cancel(); + } else if (type == boost::asio::cancellation_type::terminal) { + // write operations only support 'terminal' cancellation because we + // can't guarantee that no osd has succeeded (or will succeed) in + // applying the write + completion->cancel(); + } + } + }; + template <typename Executor1, typename CompletionHandler> - static auto create(const Executor1& ex1, CompletionHandler&& handler) { + static auto create(const Executor1& ex1, bool is_read, + CompletionHandler&& handler) { + op_cancellation* cancel_handler = nullptr; + auto slot = boost::asio::get_associated_cancellation_slot(handler); + if (slot.is_connected()) { + cancel_handler = &slot.template emplace<op_cancellation>(); + } + auto p = Completion::create(ex1, std::move(handler)); p->user_data.aio_completion.reset( Rados::aio_create_completion(p.get(), aio_dispatch)); + if (cancel_handler) { + cancel_handler->completion = p->user_data.aio_completion.get(); + cancel_handler->is_read = is_read; + p->user_data.slot = std::move(slot); + } return p; } }; @@ -108,6 +148,9 @@ struct AsyncOp : Invoker<Result> { /// Calls IoCtx::aio_read() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t, bufferlist). +/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. template <typename ExecutionContext, typename CompletionToken> auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid, size_t len, uint64_t off, CompletionToken&& token) @@ -117,7 +160,8 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid, return boost::asio::async_initiate<CompletionToken, Signature>( [] (auto handler, auto ex, IoCtx& io, const std::string& oid, size_t len, uint64_t off) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = true; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_read(oid, op.aio_completion.get(), &op.result, len, off); @@ -132,6 +176,9 @@ auto async_read(ExecutionContext& ctx, IoCtx& io, const std::string& oid, /// Calls IoCtx::aio_write() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t). +/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. template <typename ExecutionContext, typename CompletionToken> auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid, const bufferlist &bl, size_t len, uint64_t off, @@ -142,7 +189,8 @@ auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid, return boost::asio::async_initiate<CompletionToken, Signature>( [] (auto handler, auto ex, IoCtx& io, const std::string& oid, const bufferlist &bl, size_t len, uint64_t off) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = false; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_write(oid, op.aio_completion.get(), bl, len, off); @@ -157,6 +205,9 @@ auto async_write(ExecutionContext& ctx, IoCtx& io, const std::string& oid, /// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t, bufferlist). +/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. template <typename ExecutionContext, typename CompletionToken> auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, ObjectReadOperation *read_op, int flags, @@ -167,7 +218,8 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, return boost::asio::async_initiate<CompletionToken, Signature>( [] (auto handler, auto ex, IoCtx& io, const std::string& oid, ObjectReadOperation *read_op, int flags) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = true; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_operate(oid, op.aio_completion.get(), read_op, @@ -183,6 +235,9 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, /// Calls IoCtx::aio_operate() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t). +/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. template <typename ExecutionContext, typename CompletionToken> auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, ObjectWriteOperation *write_op, int flags, @@ -194,7 +249,8 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, [] (auto handler, auto ex, IoCtx& io, const std::string& oid, ObjectWriteOperation *write_op, int flags, const jspan_context* trace_ctx) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = false; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_operate(oid, op.aio_completion.get(), write_op, flags, trace_ctx); @@ -209,6 +265,9 @@ auto async_operate(ExecutionContext& ctx, IoCtx& io, const std::string& oid, /// Calls IoCtx::aio_notify() and arranges for the AioCompletion to call a /// given handler with signature (error_code, version_t, bufferlist). +/// +/// The given IoCtx reference is not required to remain valid, but some IoCtx +/// instance must preserve its underlying implementation until completion. template <typename ExecutionContext, typename CompletionToken> auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid, bufferlist& bl, uint64_t timeout_ms, CompletionToken &&token) @@ -218,7 +277,8 @@ auto async_notify(ExecutionContext& ctx, IoCtx& io, const std::string& oid, return boost::asio::async_initiate<CompletionToken, Signature>( [] (auto handler, auto ex, IoCtx& io, const std::string& oid, bufferlist& bl, uint64_t timeout_ms) { - auto p = Op::create(ex, std::move(handler)); + constexpr bool is_read = false; + auto p = Op::create(ex, is_read, std::move(handler)); auto& op = p->user_data; int ret = io.aio_notify(oid, op.aio_completion.get(), diff --git a/src/librados/librados_cxx.cc b/src/librados/librados_cxx.cc index 2167eeade3c..60217b99b41 100644 --- a/src/librados/librados_cxx.cc +++ b/src/librados/librados_cxx.cc @@ -1103,6 +1103,14 @@ void librados::AioCompletion::release() delete this; } +int librados::AioCompletion::cancel() +{ + if (!pc->io) { + return 0; // no operation was started + } + return pc->io->aio_cancel(pc); +} + ///////////////////////////// IoCtx ////////////////////////////// librados::IoCtx::IoCtx() : io_ctx_impl(NULL) { diff --git a/src/mgr/PyModule.h b/src/mgr/PyModule.h index 177447c2cb3..a47db3a47ef 100644 --- a/src/mgr/PyModule.h +++ b/src/mgr/PyModule.h @@ -161,9 +161,9 @@ public: } const std::string &get_name() const { - std::lock_guard l(lock) ; return module_name; + return module_name; } - const std::string &get_error_string() const { + std::string get_error_string() const { std::lock_guard l(lock) ; return error_string; } bool get_can_run() const { diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc index 719403925ad..2d2735f1e7c 100755 --- a/src/mon/NVMeofGwMap.cc +++ b/src/mon/NVMeofGwMap.cc @@ -171,6 +171,8 @@ int NVMeofGwMap::cfg_delete_gw( << state.availability << " Resulting GW availability: " << state.availability << dendl; state.subsystems.clear();//ignore subsystems of this GW + utime_t now = ceph_clock_now(); + mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now; return 0; } } @@ -895,10 +897,12 @@ struct CMonRequestProposal : public Context { } }; -void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const +void NVMeofGwMap::get_health_checks(health_check_map_t *checks) { list<string> singleGatewayDetail; list<string> gatewayDownDetail; + list<string> gatewayInDeletingDetail; + int deleting_gateways = 0; for (const auto& created_map_pair: created_gws) { const auto& group_key = created_map_pair.first; auto& group = group_key.second; @@ -915,9 +919,37 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const ostringstream ss; ss << "NVMeoF Gateway '" << gw_id << "' is unavailable." ; gatewayDownDetail.push_back(ss.str()); + } else if (gw_created.availability == gw_availability_t::GW_DELETING) { + deleting_gateways++; + utime_t now = ceph_clock_now(); + bool found_deleting_time = false; + auto gws_deleting_time = mon->nvmegwmon()->gws_deleting_time; + auto group_it = gws_deleting_time.find(group_key); + if (group_it != gws_deleting_time.end()) { + auto& gw_map = group_it->second; + auto gw_it = gw_map.find(gw_id); + if (gw_it != gw_map.end()) { + found_deleting_time = true; + utime_t delete_time = gw_it->second; + if ((now - delete_time) > g_conf().get_val<std::chrono::seconds>("mon_nvmeofgw_delete_grace").count()) { + ostringstream ss; + ss << "NVMeoF Gateway '" << gw_id << "' is in deleting state."; + gatewayInDeletingDetail.push_back(ss.str()); + } + } + } + if (!found_deleting_time) { + // DELETING gateway not found in gws_deleting_time, set timeout now + mon->nvmegwmon()->gws_deleting_time[group_key][gw_id] = now; + } } } } + if (deleting_gateways == 0) { + // no gateway in GW_DELETING state currently, flush old gws_deleting_time + mon->nvmegwmon()->gws_deleting_time.clear(); + } + if (!singleGatewayDetail.empty()) { ostringstream ss; ss << singleGatewayDetail.size() << " group(s) have only 1 nvmeof gateway" @@ -934,6 +966,15 @@ void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const ss.str(), gatewayDownDetail.size()); d.detail.swap(gatewayDownDetail); } + if (!gatewayInDeletingDetail.empty()) { + ostringstream ss; + ss << gatewayInDeletingDetail.size() << " gateway(s) are in deleting state" + << "; namespaces are automatically balanced across remaining gateways, " + << "this should take a few minutes."; + auto& d = checks->add("NVMEOF_GATEWAY_DELETING", HEALTH_WARN, + ss.str(), gatewayInDeletingDetail.size()); + d.detail.swap(gatewayInDeletingDetail); + } } int NVMeofGwMap::blocklist_gw( diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h index 5f657733012..85fd62b3a07 100755 --- a/src/mon/NVMeofGwMap.h +++ b/src/mon/NVMeofGwMap.h @@ -144,7 +144,7 @@ public: DECODE_FINISH(bl); } - void get_health_checks(health_check_map_t *checks) const; + void get_health_checks(health_check_map_t *checks); }; #include "NVMeofGwSerialize.h" diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h index 7fae8b766a5..d7f5fd89cde 100644 --- a/src/mon/NVMeofGwMon.h +++ b/src/mon/NVMeofGwMon.h @@ -82,6 +82,8 @@ public: void check_subs(bool type); void check_sub(Subscription *sub); + std::map<NvmeGroupKey, std::map<NvmeGwId, utime_t>> gws_deleting_time; + private: void synchronize_last_beacon(); void process_gw_down(const NvmeGwId &gw_id, diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc index 7da9a67be62..65627b5f818 100644 --- a/src/os/DBObjectMap.cc +++ b/src/os/DBObjectMap.cc @@ -519,6 +519,11 @@ bufferlist DBObjectMap::DBObjectMapIteratorImpl::value() return cur_iter->value(); } +std::string_view DBObjectMap::DBObjectMapIteratorImpl::value_as_sv() +{ + return cur_iter->value_as_sv(); +} + int DBObjectMap::DBObjectMapIteratorImpl::status() { return r; diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h index 444f21eb815..1e1452010e7 100644 --- a/src/os/DBObjectMap.h +++ b/src/os/DBObjectMap.h @@ -393,6 +393,7 @@ private: int next() override { ceph_abort(); return 0; } std::string key() override { ceph_abort(); return ""; } ceph::buffer::list value() override { ceph_abort(); return ceph::buffer::list(); } + std::string_view value_as_sv() override { ceph_abort(); return std::string_view(); } int status() override { return 0; } }; @@ -431,6 +432,7 @@ private: int next() override; std::string key() override; ceph::buffer::list value() override; + std::string_view value_as_sv() override; int status() override; bool on_parent() { diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 521435b6c31..df3ae920a2f 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -29,6 +29,7 @@ #include <errno.h> #include <sys/stat.h> +#include <functional> #include <map> #include <memory> #include <vector> @@ -735,15 +736,6 @@ public: std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) = 0; -#ifdef WITH_SEASTAR - virtual int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) = 0; -#endif - /// Filters keys into out which are defined on oid virtual int omap_check_keys( CollectionHandle &c, ///< [in] Collection containing oid @@ -766,6 +758,48 @@ public: const ghobject_t &oid ///< [in] object ) = 0; + struct omap_iter_seek_t { + std::string seek_position; + enum { + // start with provided key (seek_position), if it exists + LOWER_BOUND, + // skip provided key (seek_position) even if it exists + UPPER_BOUND + } seek_type = LOWER_BOUND; + static omap_iter_seek_t min_lower_bound() { return {}; } + }; + enum class omap_iter_ret_t { + STOP, + NEXT + }; + /** + * Iterate over object map with user-provided callable + * + * Warning! The callable is executed under lock on bluestore + * operations in c. Do not use bluestore methods on c while + * iterating. (Filling in a transaction is no problem). + * + * @param c collection + * @param oid object + * @param start_from where the iterator should point to at + * the beginning + * @param visitor callable that takes OMAP key and corresponding + * value as string_views and controls iteration + * by the return. It is executed for every object's + * OMAP entry from `start_from` till end of the + * object's OMAP or till the iteration is stopped + * by `STOP`. Please note that if there is no such + * entry, `visitor` will be called 0 times. + * @return error code, zero on success + */ + virtual int omap_iterate( + CollectionHandle &c, + const ghobject_t &oid, + omap_iter_seek_t start_from, + std::function<omap_iter_ret_t(std::string_view, + std::string_view)> visitor + ) = 0; + virtual int flush_journal() { return -EOPNOTSUPP; } virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index a024a0c2105..25e6c4fe596 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4830,7 +4830,7 @@ void BlueStore::Onode::rewrite_omap_key(const string& old, string *out) out->append(old.c_str() + out->length(), old.size() - out->length()); } -void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) +size_t BlueStore::Onode::calc_userkey_offset_in_omap_key() const { size_t pos = sizeof(uint64_t) + 1; if (!onode.is_pgmeta_omap()) { @@ -4840,9 +4840,15 @@ void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) pos += sizeof(uint64_t); } } - *user_key = key.substr(pos); + return pos; } +void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) +{ + *user_key = key.substr(calc_userkey_offset_in_omap_key()); +} + + void BlueStore::Onode::finish_write(TransContext* txc, uint32_t offset, uint32_t length) { while (true) { @@ -5519,7 +5525,13 @@ BlueStore::OmapIteratorImpl::OmapIteratorImpl( if (o->onode.has_omap()) { o->get_omap_key(string(), &head); o->get_omap_tail(&tail); + auto start1 = mono_clock::now(); it->lower_bound(head); + c->store->log_latency( + __func__, + l_bluestore_omap_seek_to_first_lat, + mono_clock::now() - start1, + c->store->cct->_conf->bluestore_log_omap_iterator_age); } } BlueStore::OmapIteratorImpl::~OmapIteratorImpl() @@ -5654,6 +5666,13 @@ bufferlist BlueStore::OmapIteratorImpl::value() return it->value(); } +std::string_view BlueStore::OmapIteratorImpl::value_as_sv() +{ + std::shared_lock l(c->lock); + ceph_assert(it->valid()); + return it->value_as_sv(); +} + // ===================================== @@ -13601,52 +13620,6 @@ int BlueStore::omap_get_values( return r; } -#ifdef WITH_SEASTAR -int BlueStore::omap_get_values( - CollectionHandle &c_, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<string> &start_after, ///< [in] Keys to get - map<string, bufferlist> *output ///< [out] Returned keys and values - ) -{ - Collection *c = static_cast<Collection *>(c_.get()); - dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; - if (!c->exists) - return -ENOENT; - std::shared_lock l(c->lock); - int r = 0; - OnodeRef o = c->get_onode(oid, false); - if (!o || !o->exists) { - r = -ENOENT; - goto out; - } - if (!o->onode.has_omap()) { - goto out; - } - o->flush(); - { - ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid); - if (!iter) { - r = -ENOENT; - goto out; - } - if (start_after) { - iter->upper_bound(*start_after); - } else { - iter->seek_to_first(); - } - for (; iter->valid(); iter->next()) { - output->insert(make_pair(iter->key(), iter->value())); - } - } - -out: - dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r - << dendl; - return r; -} -#endif - int BlueStore::omap_check_keys( CollectionHandle &c_, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap @@ -13724,6 +13697,94 @@ ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator( return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(logger,c, o, it)); } +int BlueStore::omap_iterate( + CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; + if (!c->exists) { + return -ENOENT; + } + std::shared_lock l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl; + return -ENOENT; + } + o->flush(); + dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl; + if (!o->onode.has_omap()) { + // nothing to do + return 0; + } + + KeyValueDB::Iterator it; + { + auto bounds = KeyValueDB::IteratorBounds(); + std::string lower_bound, upper_bound; + o->get_omap_key(string(), &lower_bound); + o->get_omap_tail(&upper_bound); + bounds.lower_bound = std::move(lower_bound); + bounds.upper_bound = std::move(upper_bound); + it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds)); + } + + // seek the iterator + { + std::string key; + o->get_omap_key(start_from.seek_position, &key); + auto start = ceph::mono_clock::now(); + if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) { + it->lower_bound(key); + c->store->log_latency( + __func__, + l_bluestore_omap_lower_bound_lat, + ceph::mono_clock::now() - start, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + } else { + it->upper_bound(key); + c->store->log_latency( + __func__, + l_bluestore_omap_upper_bound_lat, + ceph::mono_clock::now() - start, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + } + } + + // iterate! + std::string tail; + o->get_omap_tail(&tail); + const std::string_view::size_type userkey_offset_in_dbkey = + o->calc_userkey_offset_in_omap_key(); + ceph::timespan next_lat_acc{0}; + while (it->valid()) { + const auto& db_key = it->raw_key_as_sv().second; + if (db_key >= tail) { + break; + } + std::string_view user_key = db_key.substr(userkey_offset_in_dbkey); + omap_iter_ret_t ret = f(user_key, it->value_as_sv()); + if (ret == omap_iter_ret_t::STOP) { + break; + } else if (ret == omap_iter_ret_t::NEXT) { + ceph::time_guard<ceph::mono_clock>{next_lat_acc}; + it->next(); + } else { + ceph_abort(); + } + } + c->store->log_latency( + __func__, + l_bluestore_omap_next_lat, + next_lat_acc, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + return 0; +} + // ----------------- // write helpers diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 99f8d057cf0..5549f97ffea 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1457,6 +1457,7 @@ public: } void rewrite_omap_key(const std::string& old, std::string *out); + size_t calc_userkey_offset_in_omap_key() const; void decode_omap_key(const std::string& key, std::string *user_key); void finish_write(TransContext* txc, uint32_t offset, uint32_t length); @@ -1753,6 +1754,7 @@ public: int next() override; std::string key() override; ceph::buffer::list value() override; + std::string_view value_as_sv() override; std::string tail_key() override { return tail; } @@ -3416,15 +3418,6 @@ public: std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) override; -#ifdef WITH_SEASTAR - int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) override; -#endif - /// Filters keys into out which are defined on oid int omap_check_keys( CollectionHandle &c, ///< [in] Collection containing oid @@ -3438,6 +3431,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override { fsid = u; } diff --git a/src/os/kstore/KStore.cc b/src/os/kstore/KStore.cc index 7158486ca38..a069d429155 100644 --- a/src/os/kstore/KStore.cc +++ b/src/os/kstore/KStore.cc @@ -1651,6 +1651,13 @@ bufferlist KStore::OmapIteratorImpl::value() return it->value(); } +std::string_view KStore::OmapIteratorImpl::value_as_sv() +{ + std::shared_lock l{c->lock}; + ceph_assert(it->valid()); + return it->value_as_sv(); +} + int KStore::omap_get( CollectionHandle& ch, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap @@ -1866,6 +1873,71 @@ ObjectMap::ObjectMapIterator KStore::get_omap_iterator( return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it)); } +int KStore::omap_iterate( + CollectionHandle &ch, ///< [in] collection + const ghobject_t &oid, ///< [in] object + ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + { + std::shared_lock l{c->lock}; + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl; + return -ENOENT; + } + o->flush(); + dout(10) << __func__ << " header = " << o->onode.omap_head <<dendl; + + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + std::string tail; + std::string seek_key; + if (o->onode.omap_head) { + return 0; // nothing to do + } + + // acquire data depedencies for seek & iterate + get_omap_key(o->onode.omap_head, start_from.seek_position, &seek_key); + get_omap_tail(o->onode.omap_head, &tail); + + // acquire the iterator + { + it = db->get_iterator(PREFIX_OMAP); + } + + // seek the iterator + { + if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) { + it->lower_bound(seek_key); + } else { + it->upper_bound(seek_key); + } + } + + // iterate! + while (it->valid()) { + std::string user_key; + if (const auto& db_key = it->raw_key().second; db_key >= tail) { + break; + } else { + decode_omap_key(db_key, &user_key); + } + omap_iter_ret_t ret = f(user_key, it->value_as_sv()); + if (ret == omap_iter_ret_t::STOP) { + break; + } else if (ret == omap_iter_ret_t::NEXT) { + it->next(); + } else { + ceph_abort(); + } + } + } + return 0; +} + // ----------------- // write helpers diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h index 9a9d413c66a..06115d3cab7 100644 --- a/src/os/kstore/KStore.h +++ b/src/os/kstore/KStore.h @@ -180,6 +180,7 @@ public: int next() override; std::string key() override; ceph::buffer::list value() override; + std::string_view value_as_sv() override; int status() override { return 0; } @@ -553,6 +554,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override { fsid = u; } diff --git a/src/os/memstore/MemStore.cc b/src/os/memstore/MemStore.cc index 89cb09361cf..f9d3bf0d8a2 100644 --- a/src/os/memstore/MemStore.cc +++ b/src/os/memstore/MemStore.cc @@ -537,30 +537,6 @@ int MemStore::omap_get_values( return 0; } -#ifdef WITH_SEASTAR -int MemStore::omap_get_values( - CollectionHandle& ch, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) -{ - dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; - Collection *c = static_cast<Collection*>(ch.get()); - ObjectRef o = c->get_object(oid); - if (!o) - return -ENOENT; - assert(start_after); - std::lock_guard lock{o->omap_mutex}; - for (auto it = o->omap.upper_bound(*start_after); - it != std::end(o->omap); - ++it) { - out->insert(*it); - } - return 0; -} -#endif - int MemStore::omap_check_keys( CollectionHandle& ch, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap @@ -622,6 +598,10 @@ public: std::lock_guard lock{o->omap_mutex}; return it->second; } + std::string_view value_as_sv() override { + std::lock_guard lock{o->omap_mutex}; + return std::string_view{it->second.c_str(), it->second.length()}; + } int status() override { return 0; } @@ -639,6 +619,48 @@ ObjectMap::ObjectMapIterator MemStore::get_omap_iterator( return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o)); } +int MemStore::omap_iterate( + CollectionHandle &ch, ///< [in] collection + const ghobject_t &oid, ///< [in] object + ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f) +{ + Collection *c = static_cast<Collection*>(ch.get()); + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + + { + std::lock_guard lock{o->omap_mutex}; + + // obtain seek the iterator + decltype(o->omap)::iterator it; + { + if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) { + it = o->omap.lower_bound(start_from.seek_position); + } else { + it = o->omap.upper_bound(start_from.seek_position); + } + } + + // iterate! + while (it != o->omap.end()) { + // potentially rectifying memcpy but who cares for memstore? + omap_iter_ret_t ret = + f(it->first, std::string_view{it->second.c_str(), it->second.length()}); + if (ret == omap_iter_ret_t::STOP) { + break; + } else if (ret == omap_iter_ret_t::NEXT) { + ++it; + } else { + ceph_abort(); + } + } + } + return 0; +} + // --------------- // write operations diff --git a/src/os/memstore/MemStore.h b/src/os/memstore/MemStore.h index 2abe552891f..9621773598f 100644 --- a/src/os/memstore/MemStore.h +++ b/src/os/memstore/MemStore.h @@ -363,14 +363,6 @@ public: const std::set<std::string> &keys, ///< [in] Keys to get std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) override; -#ifdef WITH_SEASTAR - int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) override; -#endif using ObjectStore::omap_check_keys; /// Filters keys into out which are defined on oid @@ -387,6 +379,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override; uuid_d get_fsid() override; diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index fa2570aba42..8630b038812 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -945,6 +945,10 @@ void ECBackend::handle_sub_write( } trace.event("handle_sub_write"); + if (cct->_conf->bluestore_debug_inject_read_err && + ec_inject_test_write_error3(op.soid)) { + ceph_abort_msg("Error inject - OSD down"); + } if (!get_parent()->pgb_is_primary()) get_parent()->update_stats(op.stats); ObjectStore::Transaction localt; @@ -1191,6 +1195,15 @@ void ECBackend::handle_sub_write_reply( i->second->on_all_commit = 0; i->second->trace.event("ec write all committed"); } + if (cct->_conf->bluestore_debug_inject_read_err && + (i->second->pending_commit.size() == 1) && + ec_inject_test_write_error2(i->second->hoid)) { + std::string cmd = + "{ \"prefix\": \"osd down\", \"ids\": [\"" + std::to_string( get_parent()->whoami() ) + "\"] }"; + vector<std::string> vcmd{cmd}; + dout(0) << __func__ << " Error inject - marking OSD down" << dendl; + get_parent()->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr); + } rmw_pipeline.check_ops(); } @@ -1208,6 +1221,19 @@ void ECBackend::handle_sub_read_reply( return; } ReadOp &rop = iter->second; + if (cct->_conf->bluestore_debug_inject_read_err) { + for (auto i = op.buffers_read.begin(); + i != op.buffers_read.end(); + ++i) { + if (ec_inject_test_read_error0(ghobject_t(i->first, ghobject_t::NO_GEN, op.from.shard))) { + dout(0) << __func__ << " Error inject - EIO error for shard " << op.from.shard << dendl; + op.buffers_read.erase(i->first); + op.attrs_read.erase(i->first); + op.errors[i->first] = -EIO; + } + + } + } for (auto i = op.buffers_read.begin(); i != op.buffers_read.end(); ++i) { diff --git a/src/osd/ECCommon.cc b/src/osd/ECCommon.cc index 609ac3141ae..59077547fcb 100644 --- a/src/osd/ECCommon.cc +++ b/src/osd/ECCommon.cc @@ -226,8 +226,14 @@ void ECCommon::ReadPipeline::get_all_avail_shards( ++i) { dout(10) << __func__ << ": checking acting " << *i << dendl; const pg_missing_t &missing = get_parent()->get_shard_missing(*i); - if (error_shards.find(*i) != error_shards.end()) + if (error_shards.contains(*i)) { continue; + } + if (cct->_conf->bluestore_debug_inject_read_err && + ec_inject_test_read_error1(ghobject_t(hoid, ghobject_t::NO_GEN, i->shard))) { + dout(0) << __func__ << " Error inject - Missing shard " << i->shard << dendl; + continue; + } if (!missing.is_missing(hoid)) { ceph_assert(!have.count(i->shard)); have.insert(i->shard); @@ -912,6 +918,11 @@ bool ECCommon::RMWPipeline::try_reads_to_commit() if (*i == get_parent()->whoami_shard()) { should_write_local = true; local_write_op.claim(sop); + } else if (cct->_conf->bluestore_debug_inject_read_err && + ec_inject_test_write_error1(ghobject_t(op->hoid, + ghobject_t::NO_GEN, i->shard))) { + dout(0) << " Error inject - Dropping write message to shard " << + i->shard << dendl; } else { MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop); r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard); @@ -1090,3 +1101,305 @@ ECUtil::HashInfoRef ECCommon::UnstableHashInfoRegistry::get_hash_info( } return ref; } + +// Error inject interfaces +static ceph::recursive_mutex ec_inject_lock = + ceph::make_recursive_mutex("ECCommon::ec_inject_lock"); +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_read_failures0; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_read_failures1; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures0; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures1; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures2; +static std::map<ghobject_t,std::pair<int64_t,int64_t>> ec_inject_write_failures3; +static std::map<ghobject_t,shard_id_t> ec_inject_write_failures0_shard; +static std::set<osd_reqid_t> ec_inject_write_failures0_reqid; + +/** + * Configure a read error inject that typically forces additional reads of + * shards in an EC pool to recover data using the redundancy. With multiple + * errors it is possible to force client reads to fail. + * + * Type 0 - Simulate a medium error. Fail a read with -EIO to force + * additional reads and a decode + * + * Type 1 - Simulate a missing OSD. Dont even try to read a shard + * + * @brief Set up a read error inject for an object in an EC pool. + * @param o Target object for the error inject. + * @param when Error inject starts after this many object store reads. + * @param duration Error inject affects this many object store reads. + * @param type Type of error inject 0 = EIO, 1 = missing shard. + * @return string Result of configuring the error inject. + */ +std::string ec_inject_read_error(const ghobject_t& o, + const int64_t type, + const int64_t when, + const int64_t duration) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + ghobject_t os = o; + if (os.hobj.oid.name == "*") { + os.hobj.set_hash(0); + } + switch (type) { + case 0: + ec_inject_read_failures0[os] = std::pair(when, duration); + return "ok - read returns EIO"; + case 1: + ec_inject_read_failures1[os] = std::pair(when, duration); + return "ok - read pretends shard is missing"; + default: + break; + } + return "unrecognized error inject type"; +} + +/** + * Configure a write error inject that either fails an OSD or causes a + * client write operation to be rolled back. + * + * Type 0 - Tests rollback. Drop a write I/O to a shard, then simulate an OSD + * down to force rollback to occur, lastly fail the retried write from the + * client so the results of the rollback can be inspected. + * + * Type 1 - Drop a write I/O to a shard. Used on its own this will hang a + * write I/O. + * + * Type 2 - Simulate an OSD down (ceph osd down) to force a new epoch. Usually + * used together with type 1 to force a rollback + * + * Type 3 - Abort when an OSD processes a write I/O to a shard. Typically the + * client write will be commited while the OSD is absent which will result in + * recovery or backfill later when the OSD returns. + * + * @brief Set up a write error inject for an object in an EC pool. + * @param o Target object for the error inject. + * @param when Error inject starts after this many object store reads. + * @param duration Error inject affects this many object store reads. + * @param type Type of error inject 0 = EIO, 1 = missing shard. + * @return string Result of configuring the error inect. + */ +std::string ec_inject_write_error(const ghobject_t& o, + const int64_t type, + const int64_t when, + const int64_t duration) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures; + ghobject_t os = o; + bool no_shard = true; + std::string result; + switch (type) { + case 0: + failures = &ec_inject_write_failures0; + result = "ok - drop write, sim OSD down and fail client retry with EINVAL"; + break; + case 1: + failures = &ec_inject_write_failures1; + no_shard = false; + result = "ok - drop write to shard"; + break; + case 2: + failures = &ec_inject_write_failures2; + result = "ok - inject OSD down"; + break; + case 3: + if (duration != 1) { + return "duration must be 1"; + } + failures = &ec_inject_write_failures3; + result = "ok - write abort OSDs"; + break; + default: + return "unrecognized error inject type"; + } + if (no_shard) { + os.set_shard(shard_id_t::NO_SHARD); + } + if (os.hobj.oid.name == "*") { + os.hobj.set_hash(0); + } + (*failures)[os] = std::pair(when, duration); + if (type == 0) { + ec_inject_write_failures0_shard[os] = o.shard_id; + } + return result; +} + +/** + * @brief Clear a previously configured read error inject. + * @param o Target object for the error inject. + * @param type Type of error inject 0 = EIO, 1 = missing shard. + * @return string Indication of how many errors were cleared. + */ +std::string ec_inject_clear_read_error(const ghobject_t& o, + const int64_t type) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures; + ghobject_t os = o; + int64_t remaining = 0; + switch (type) { + case 0: + failures = &ec_inject_read_failures0; + break; + case 1: + failures = &ec_inject_read_failures1; + break; + default: + return "unrecognized error inject type"; + } + if (os.hobj.oid.name == "*") { + os.hobj.set_hash(0); + } + auto it = failures->find(os); + if (it != failures->end()) { + remaining = it->second.second; + failures->erase(it); + } + if (remaining == 0) { + return "no outstanding error injects"; + } else if (remaining == 1) { + return "ok - 1 inject cleared"; + } + return "ok - " + std::to_string(remaining) + " injects cleared"; +} + +/** + * @brief Clear a previously configured write error inject. + * @param o Target object for the error inject. + * @param type Type of error inject 0 = EIO, 1 = missing shard. + * @return string Indication of how many errors were cleared. + */ +std::string ec_inject_clear_write_error(const ghobject_t& o, + const int64_t type) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures; + ghobject_t os = o; + bool no_shard = true; + int64_t remaining = 0; + switch (type) { + case 0: + failures = &ec_inject_write_failures0; + break; + case 1: + failures = &ec_inject_write_failures1; + no_shard = false; + break; + case 2: + failures = &ec_inject_write_failures2; + break; + case 3: + failures = &ec_inject_write_failures3; + break; + default: + return "unrecognized error inject type"; + } + if (no_shard) { + os.set_shard(shard_id_t::NO_SHARD); + } + if (os.hobj.oid.name == "*") { + os.hobj.set_hash(0); + } + auto it = failures->find(os); + if (it != failures->end()) { + remaining = it->second.second; + failures->erase(it); + if (type == 0) { + ec_inject_write_failures0_shard.erase(os); + } + } + if (remaining == 0) { + return "no outstanding error injects"; + } else if (remaining == 1) { + return "ok - 1 inject cleared"; + } + return "ok - " + std::to_string(remaining) + " injects cleared"; +} + +static bool ec_inject_test_error(const ghobject_t& o, + std::map<ghobject_t,std::pair<int64_t,int64_t>> *failures) +{ + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + auto it = failures->find(o); + if (it == failures->end()) { + ghobject_t os = o; + os.hobj.oid.name = "*"; + os.hobj.set_hash(0); + it = failures->find(os); + } + if (it != failures->end()) { + auto && [when,duration] = it->second; + if (when > 0) { + when--; + return false; + } + if (--duration <= 0) { + failures->erase(it); + } + return true; + } + return false; +} + +bool ec_inject_test_read_error0(const ghobject_t& o) +{ + return ec_inject_test_error(o, &ec_inject_read_failures0); +} + +bool ec_inject_test_read_error1(const ghobject_t& o) +{ + return ec_inject_test_error(o, &ec_inject_read_failures1); +} + +bool ec_inject_test_write_error0(const hobject_t& o, + const osd_reqid_t& reqid) { + std::lock_guard<ceph::recursive_mutex> l(ec_inject_lock); + ghobject_t os = ghobject_t(o, ghobject_t::NO_GEN, shard_id_t::NO_SHARD); + if (ec_inject_write_failures0_reqid.count(reqid)) { + // Matched reqid of retried write - flag for failure + ec_inject_write_failures0_reqid.erase(reqid); + return true; + } + auto it = ec_inject_write_failures0.find(os); + if (it == ec_inject_write_failures0.end()) { + os.hobj.oid.name = "*"; + os.hobj.set_hash(0); + it = ec_inject_write_failures0.find(os); + } + if (it != ec_inject_write_failures0.end()) { + auto && [when, duration] = it->second; + auto shard = ec_inject_write_failures0_shard.find(os)->second; + if (when > 0) { + when--; + } else { + if (--duration <= 0) { + ec_inject_write_failures0.erase(it); + ec_inject_write_failures0_shard.erase(os); + } + // Error inject triggered - save reqid + ec_inject_write_failures0_reqid.insert(reqid); + // Set up error inject to drop message to primary + ec_inject_write_error(ghobject_t(o, ghobject_t::NO_GEN, shard), 1, 0, 1); + } + } + return false; +} + +bool ec_inject_test_write_error1(const ghobject_t& o) { + bool rc = ec_inject_test_error(o, &ec_inject_write_failures1); + if (rc) { + // Set up error inject to generate OSD down + ec_inject_write_error(o, 2, 0, 1); + } + return rc; +} + +bool ec_inject_test_write_error2(const hobject_t& o) { + return ec_inject_test_error( + ghobject_t(o, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + &ec_inject_write_failures2); +} + +bool ec_inject_test_write_error3(const hobject_t& o) { + return ec_inject_test_error( + ghobject_t(o, ghobject_t::NO_GEN, shard_id_t::NO_SHARD), + &ec_inject_write_failures3); +} diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h index 7ff9cae7646..de4c11ad50f 100644 --- a/src/osd/ECCommon.h +++ b/src/osd/ECCommon.h @@ -493,6 +493,7 @@ struct ECCommon { ); ///< @return error code, 0 on success void schedule_recovery_work(); + }; /** @@ -843,3 +844,15 @@ void ECCommon::ReadPipeline::filter_read_op( on_schedule_recovery(op); } } + +// Error inject interfaces +std::string ec_inject_read_error(const ghobject_t& o, const int64_t type, const int64_t when, const int64_t duration); +std::string ec_inject_write_error(const ghobject_t& o, const int64_t type, const int64_t when, const int64_t duration); +std::string ec_inject_clear_read_error(const ghobject_t& o, const int64_t type); +std::string ec_inject_clear_write_error(const ghobject_t& o, const int64_t type); +bool ec_inject_test_read_error0(const ghobject_t& o); +bool ec_inject_test_read_error1(const ghobject_t& o); +bool ec_inject_test_write_error0(const hobject_t& o,const osd_reqid_t& reqid); +bool ec_inject_test_write_error1(const ghobject_t& o); +bool ec_inject_test_write_error2(const hobject_t& o); +bool ec_inject_test_write_error3(const hobject_t& o); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 5223eb283e9..9c9e540cf61 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -37,6 +37,7 @@ #include "osd/PG.h" #include "osd/scrubber/scrub_machine.h" #include "osd/scrubber/pg_scrubber.h" +#include "osd/ECCommon.h" #include "include/types.h" #include "include/compat.h" @@ -4348,6 +4349,46 @@ void OSD::final_init() "inject metadata error to an object"); ceph_assert(r == 0); r = admin_socket->register_command( + "injectecreaderr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=true,range=0|255 " \ + "name=type,type=CephInt,req=false " \ + "name=when,type=CephInt,req=false " \ + "name=duration,type=CephInt,req=false", + test_ops_hook, + "inject error for read of object in an EC pool"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "injectecclearreaderr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=true,range=0|255 " \ + "name=type,type=CephInt,req=false", + test_ops_hook, + "clear read error injects for object in an EC pool"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "injectecwriteerr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=true,range=0|255 " \ + "name=type,type=CephInt,req=false " \ + "name=when,type=CephInt,req=false " \ + "name=duration,type=CephInt,req=false", + test_ops_hook, + "inject error for write of object in an EC pool"); + ceph_assert(r == 0); + r = admin_socket->register_command( + "injectecclearwriteerr " \ + "name=pool,type=CephString " \ + "name=objname,type=CephObjectname " \ + "name=shardid,type=CephInt,req=true,range=0|255 " \ + "name=type,type=CephInt,req=false", + test_ops_hook, + "clear write error inject for object in an EC pool"); + ceph_assert(r == 0); + r = admin_socket->register_command( "set_recovery_delay " \ "name=utime,type=CephInt,req=false", test_ops_hook, @@ -6487,8 +6528,10 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, //directly request the osd make a change. if (command == "setomapval" || command == "rmomapkey" || command == "setomapheader" || command == "getomap" || - command == "truncobj" || command == "injectmdataerr" || - command == "injectdataerr" + command == "truncobj" || + command == "injectmdataerr" || command == "injectdataerr" || + command == "injectecreaderr" || command == "injectecclearreaderr" || + command == "injectecwriteerr" || command == "injectecclearwriteerr" ) { pg_t rawpg; int64_t pool; @@ -6527,8 +6570,21 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid))); spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid)); if (curmap->pg_is_ec(rawpg)) { - if ((command != "injectdataerr") && (command != "injectmdataerr")) { - ss << "Must not call on ec pool, except injectdataerr or injectmdataerr"; + if ((command != "injectdataerr") && + (command != "injectmdataerr") && + (command != "injectecreaderr") && + (command != "injectecclearreaderr") && + (command != "injectecwriteerr") && + (command != "injectecclearwriteerr")) { + ss << "Must not call on ec pool"; + return; + } + } else { + if ((command == "injectecreaderr") || + (command == "injecteclearreaderr") || + (command == "injectecwriteerr") || + (command == "injecteclearwriteerr")) { + ss << "Only supported on ec pool"; return; } } @@ -6607,6 +6663,38 @@ void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store, } else if (command == "injectmdataerr") { store->inject_mdata_error(gobj); ss << "ok"; + } else if (command == "injectecreaderr") { + if (service->cct->_conf->bluestore_debug_inject_read_err) { + int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0); + int64_t when = cmd_getval_or<int64_t>(cmdmap, "when", 0); + int64_t duration = cmd_getval_or<int64_t>(cmdmap, "duration", 1); + ss << ec_inject_read_error(gobj, type, when, duration); + } else { + ss << "bluestore_debug_inject_read_err not enabled"; + } + } else if (command == "injectecclearreaderr") { + if (service->cct->_conf->bluestore_debug_inject_read_err) { + int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0); + ss << ec_inject_clear_read_error(gobj, type); + } else { + ss << "bluestore_debug_inject_read_err not enabled"; + } + } else if (command == "injectecwriteerr") { + if (service->cct->_conf->bluestore_debug_inject_read_err) { + int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0); + int64_t when = cmd_getval_or<int64_t>(cmdmap, "when", 0); + int64_t duration = cmd_getval_or<int64_t>(cmdmap, "duration", 1); + ss << ec_inject_write_error(gobj, type, when, duration); + } else { + ss << "bluestore_debug_inject_read_err not enabled"; + } + } else if (command == "injectecclearwriteerr") { + if (service->cct->_conf->bluestore_debug_inject_read_err) { + int64_t type = cmd_getval_or<int64_t>(cmdmap, "type", 0); + ss << ec_inject_clear_write_error(gobj, type); + } else { + ss << "bluestore_debug_inject_read_err not enabled"; + } } return; } diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index b87484c1a9d..9b3593d54e5 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1642,12 +1642,10 @@ void OSDMap::get_out_of_subnet_osd_counts(CephContext *cct, for (int i = 0; i < max_osd; i++) { if (exists(i) && is_up(i)) { if (const auto& addrs = get_addrs(i).v; addrs.size() >= 2) { - auto v1_addr = addrs[0].ip_only_to_str(); - if (!is_addr_in_subnet(cct, public_network, v1_addr)) { + if (!is_addr_in_subnet(cct, public_network, addrs[0])) { unreachable->emplace(i); } - auto v2_addr = addrs[1].ip_only_to_str(); - if (!is_addr_in_subnet(cct, public_network, v2_addr)) { + if (!is_addr_in_subnet(cct, public_network, addrs[1])) { unreachable->emplace(i); } } diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index b87aa1da677..f5eb9ea951e 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -290,6 +290,10 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef; MessageRef, Connection *con) = 0; virtual void send_message_osd_cluster( Message *m, const ConnectionRef& con) = 0; + virtual void start_mon_command( + const std::vector<std::string>& cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs, + Context *onfinish) = 0; virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0; virtual entity_name_t get_cluster_msgr_name() = 0; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 44f8e85b5ef..3324ba9dc91 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -2286,6 +2286,16 @@ void PrimaryLogPG::do_op(OpRequestRef& op) } } + if (cct->_conf->bluestore_debug_inject_read_err && + op->may_write() && + pool.info.is_erasure() && + ec_inject_test_write_error0(m->get_hobj(), m->get_reqid())) { + // Fail retried write with error + dout(0) << __func__ << " Error inject - Fail retried write with EINVAL" << dendl; + osd->reply_op_error(op, -EINVAL); + return; + } + ObjectContextRef obc; bool can_create = op->may_write(); hobject_t missing_oid; @@ -5798,10 +5808,19 @@ int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op) int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl) { - for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) { - char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0); - if (osd_op.indata[idx] != read_byte) { - return (-MAX_ERRNO - idx); + auto input_iter = osd_op.indata.begin(); + auto read_iter = read_bl.begin(); + uint64_t idx = 0; + + while (input_iter != osd_op.indata.end()) { + char read_byte = (read_iter != read_bl.end() ? *read_iter : 0); + if (*input_iter != read_byte) { + return (-MAX_ERRNO - idx); + } + ++idx; + ++input_iter; + if (read_iter != read_bl.end()) { + ++read_iter; } } @@ -7767,27 +7786,34 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) bool truncated = false; bufferlist bl; if (oi.is_omap()) { - ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( - ch, ghobject_t(soid) - ); - if (!iter) { - result = -ENOENT; - goto fail; - } - iter->upper_bound(start_after); - if (filter_prefix > start_after) iter->lower_bound(filter_prefix); - for (num = 0; - iter->valid() && - iter->key().substr(0, filter_prefix.size()) == filter_prefix; - ++num, iter->next()) { - dout(20) << "Found key " << iter->key() << dendl; - if (num >= max_return || - bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) { - truncated = true; - break; - } - encode(iter->key(), bl); - encode(iter->value(), bl); + using omap_iter_seek_t = ObjectStore::omap_iter_seek_t; + result = osd->store->omap_iterate( + ch, ghobject_t(soid), + // try to seek as many keys-at-once as possible for the sake of performance. + // note complexity should be logarithmic, so seek(n/2) + seek(n/2) is worse + // than just seek(n). + ObjectStore::omap_iter_seek_t{ + .seek_position = std::max(start_after, filter_prefix), + .seek_type = filter_prefix > start_after ? omap_iter_seek_t::LOWER_BOUND + : omap_iter_seek_t::UPPER_BOUND + }, + [&bl, &truncated, &filter_prefix, &num, max_return, + max_bytes=cct->_conf->osd_max_omap_bytes_per_request] + (std::string_view key, std::string_view value) mutable { + if (key.substr(0, filter_prefix.size()) != filter_prefix) { + return ObjectStore::omap_iter_ret_t::STOP; + } + if (num >= max_return || bl.length() >= max_bytes) { + truncated = true; + return ObjectStore::omap_iter_ret_t::STOP; + } + encode(key, bl); + encode(value, bl); + ++num; + return ObjectStore::omap_iter_ret_t::NEXT; + }); + if (result < 0) { + goto fail; } } // else return empty out_set encode(num, osd_op.outdata); diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index f66b5c6e16a..bf55d539821 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -622,6 +622,12 @@ public: Message *m, const ConnectionRef& con) override { osd->send_message_osd_cluster(m, con); } + void start_mon_command( + const std::vector<std::string>& cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs, + Context *onfinish) override { + osd->monc->start_mon_command(cmd, inbl, outbl, outs, onfinish); + } ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) override; entity_name_t get_cluster_msgr_name() override { return osd->get_cluster_msgr_name(); @@ -1993,6 +1999,7 @@ public: private: DynamicPerfStats m_dynamic_perf_stats; + }; inline ostream& operator<<(ostream& out, const PrimaryLogPG::RepGather& repop) @@ -2021,5 +2028,4 @@ inline ostream& operator<<(ostream& out, void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop); void intrusive_ptr_release(PrimaryLogPG::RepGather *repop); - #endif diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index f1c56d75378..550604fc55b 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -2036,8 +2036,8 @@ class CertKeyStore(): var = service_name if entity in self.service_name_cert else host j = {} self.known_certs[entity][var] = cert_obj - for service_name in self.known_certs[entity].keys(): - j[var] = Cert.to_json(self.known_certs[entity][var]) + for cert_key in self.known_certs[entity]: + j[cert_key] = Cert.to_json(self.known_certs[entity][cert_key]) else: self.known_certs[entity] = cert_obj j = Cert.to_json(cert_obj) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index bf14f8d1715..6690153d435 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -2460,7 +2460,7 @@ Then run the following: @handle_orch_error def service_action(self, action: str, service_name: str) -> List[str]: - if service_name not in self.spec_store.all_specs.keys(): + if service_name not in self.spec_store.all_specs.keys() and service_name != 'osd': raise OrchestratorError(f'Invalid service name "{service_name}".' + ' View currently running services using "ceph orch ls"') dds: List[DaemonDescription] = self.cache.get_daemons_by_service(service_name) @@ -3925,6 +3925,50 @@ Then run the following: return self.to_remove_osds.all_osds() @handle_orch_error + def set_osd_spec(self, service_name: str, osd_ids: List[str]) -> str: + """ + Update unit.meta file for osd with service name + """ + if service_name not in self.spec_store: + raise OrchestratorError(f"Cannot find service '{service_name}' in the inventory. " + "Please try again after applying an OSD service that matches " + "the service name to which you want to attach OSDs.") + + daemons: List[orchestrator.DaemonDescription] = self.cache.get_daemons_by_type('osd') + update_osd = defaultdict(list) + for daemon in daemons: + if daemon.daemon_id in osd_ids and daemon.hostname: + update_osd[daemon.hostname].append(daemon.daemon_id) + + if not update_osd: + raise OrchestratorError(f"Unable to find OSDs: {osd_ids}") + + failed_osds = [] + success_osds = [] + for host in update_osd: + osds = ",".join(update_osd[host]) + # run cephadm command with all host osds on specific host, + # if it fails, continue with other hosts + try: + with self.async_timeout_handler(host): + outs, errs, _code = self.wait_async( + CephadmServe(self)._run_cephadm(host, + cephadmNoImage, + 'update-osd-service', + ['--service-name', service_name, '--osd-ids', osds])) + if _code: + self.log.error(f"Failed to update service for {osds} osd. Cephadm error: {errs}") + failed_osds.extend(update_osd[host]) + else: + success_osds.extend(update_osd[host]) + except Exception: + self.log.exception(f"Failed to set service name for {osds}") + failed_osds.extend(update_osd[host]) + self.cache.invalidate_host_daemons(host) + self._kick_serve_loop() + return f"Updated service for osd {','.join(success_osds)}" + (f" and failed for {','.join(failed_osds)}" if failed_osds else "") + + @handle_orch_error @host_exists() def drain_host(self, hostname: str, force: bool = False, keep_conf_keyring: bool = False, zap_osd_devices: bool = False) -> str: """ diff --git a/src/pybind/mgr/cephadm/schedule.py b/src/pybind/mgr/cephadm/schedule.py index 98d2fe99897..04d3712c50a 100644 --- a/src/pybind/mgr/cephadm/schedule.py +++ b/src/pybind/mgr/cephadm/schedule.py @@ -385,6 +385,8 @@ class HostAssignment(object): def find_ip_on_host(self, hostname: str, subnets: List[str]) -> Optional[str]: for subnet in subnets: + # to normalize subnet + subnet = str(ipaddress.ip_network(subnet)) ips: List[str] = [] # following is to allow loopback interfaces for both ipv4 and ipv6. Since we # only have the subnet (and no IP) we assume default loopback IP address. diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 04f5af28a9b..4f83d7bb0fb 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -1157,6 +1157,14 @@ class RgwService(CephService): 'value': str(spec.rgw_bucket_counters_cache_size), }) + if getattr(spec, 'disable_multisite_sync_traffic', None) is not None: + ret, out, err = self.mgr.check_mon_command({ + 'prefix': 'config set', + 'who': daemon_name, + 'name': 'rgw_run_sync_thread', + 'value': 'false' if spec.disable_multisite_sync_traffic else 'true', + }) + daemon_spec.keyring = keyring daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec) diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 1b9cf618570..9c5b5a112f3 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -3,6 +3,7 @@ import logging import os import socket from typing import List, Any, Tuple, Dict, Optional, cast +import ipaddress from mgr_module import HandleCommandResult @@ -57,6 +58,8 @@ class GrafanaService(CephadmService): if ip_to_bind_to: daemon_spec.port_ips = {str(grafana_port): ip_to_bind_to} grafana_ip = ip_to_bind_to + if ipaddress.ip_network(grafana_ip).version == 6: + grafana_ip = f"[{grafana_ip}]" domain = self.mgr.get_fqdn(daemon_spec.host) mgmt_gw_ips = [] @@ -354,6 +357,13 @@ class AlertmanagerService(CephadmService): addr = self.mgr.get_fqdn(dd.hostname) peers.append(build_url(host=addr, port=port).lstrip('/')) + ip_to_bind_to = '' + if spec.only_bind_port_on_networks and spec.networks: + assert daemon_spec.host is not None + ip_to_bind_to = self.mgr.get_first_matching_network_ip(daemon_spec.host, spec) or '' + if ip_to_bind_to: + daemon_spec.port_ips = {str(port): ip_to_bind_to} + deps.append(f'secure_monitoring_stack:{self.mgr.secure_monitoring_stack}') if security_enabled: alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() @@ -376,7 +386,8 @@ class AlertmanagerService(CephadmService): }, 'peers': peers, 'web_config': '/etc/alertmanager/web.yml', - 'use_url_prefix': mgmt_gw_enabled + 'use_url_prefix': mgmt_gw_enabled, + 'ip_to_bind_to': ip_to_bind_to }, sorted(deps) else: return { @@ -384,7 +395,8 @@ class AlertmanagerService(CephadmService): "alertmanager.yml": yml }, "peers": peers, - 'use_url_prefix': mgmt_gw_enabled + 'use_url_prefix': mgmt_gw_enabled, + 'ip_to_bind_to': ip_to_bind_to }, sorted(deps) def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription: diff --git a/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 b/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 index de993cb6ce3..b6955caf616 100644 --- a/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/alertmanager/alertmanager.yml.j2 @@ -8,6 +8,8 @@ global: tls_config: {% if security_enabled %} ca_file: root_cert.pem + cert_file: alertmanager.crt + key_file: alertmanager.key {% else %} insecure_skip_verify: true {% endif %} diff --git a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 index b9773ceeeb3..14af0fd48ca 100644 --- a/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/mgmt-gateway/nginx.conf.j2 @@ -9,6 +9,7 @@ events { http { #access_log /dev/stdout; + error_log /dev/stderr info; client_header_buffer_size 32K; large_client_header_buffers 4 32k; proxy_busy_buffers_size 512k; diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index ecfd899af71..961da145dac 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 @@ -28,6 +28,8 @@ alerting: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} - scheme: http http_sd_configs: @@ -56,6 +58,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} honor_labels: true http_sd_configs: @@ -81,6 +85,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ node_exporter_sd_url }} @@ -104,6 +110,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ haproxy_sd_url }} @@ -128,6 +136,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} honor_labels: true http_sd_configs: @@ -149,6 +159,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ nvmeof_sd_url }} @@ -169,6 +181,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ nfs_sd_url }} @@ -189,6 +203,8 @@ scrape_configs: password: {{ service_discovery_password }} tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key {% else %} http_sd_configs: - url: {{ smb_sd_url }} diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index b81510504d9..22bd26def91 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -1741,16 +1741,23 @@ class TestCephadm(object): nvmeof_client_cert = 'fake-nvmeof-client-cert' nvmeof_server_cert = 'fake-nvmeof-server-cert' nvmeof_root_ca_cert = 'fake-nvmeof-root-ca-cert' + grafana_cert_host_1 = 'grafana-cert-host-1' + grafana_cert_host_2 = 'grafana-cert-host-2' cephadm_module.cert_key_store.save_cert('rgw_frontend_ssl_cert', rgw_frontend_rgw_foo_host2_cert, service_name='rgw.foo', user_made=True) cephadm_module.cert_key_store.save_cert('nvmeof_server_cert', nvmeof_server_cert, service_name='nvmeof.foo', user_made=True) cephadm_module.cert_key_store.save_cert('nvmeof_client_cert', nvmeof_client_cert, service_name='nvmeof.foo', user_made=True) cephadm_module.cert_key_store.save_cert('nvmeof_root_ca_cert', nvmeof_root_ca_cert, service_name='nvmeof.foo', user_made=True) + cephadm_module.cert_key_store.save_cert('grafana_cert', grafana_cert_host_1, host='host-1', user_made=True) + cephadm_module.cert_key_store.save_cert('grafana_cert', grafana_cert_host_2, host='host-2', user_made=True) expected_calls = [ mock.call(f'{CERT_STORE_CERT_PREFIX}rgw_frontend_ssl_cert', json.dumps({'rgw.foo': Cert(rgw_frontend_rgw_foo_host2_cert, True).to_json()})), mock.call(f'{CERT_STORE_CERT_PREFIX}nvmeof_server_cert', json.dumps({'nvmeof.foo': Cert(nvmeof_server_cert, True).to_json()})), mock.call(f'{CERT_STORE_CERT_PREFIX}nvmeof_client_cert', json.dumps({'nvmeof.foo': Cert(nvmeof_client_cert, True).to_json()})), mock.call(f'{CERT_STORE_CERT_PREFIX}nvmeof_root_ca_cert', json.dumps({'nvmeof.foo': Cert(nvmeof_root_ca_cert, True).to_json()})), + mock.call(f'{CERT_STORE_CERT_PREFIX}grafana_cert', json.dumps({'host-1': Cert(grafana_cert_host_1, True).to_json()})), + mock.call(f'{CERT_STORE_CERT_PREFIX}grafana_cert', json.dumps({'host-1': Cert(grafana_cert_host_1, True).to_json(), + 'host-2': Cert(grafana_cert_host_2, True).to_json()})) ] _set_store.assert_has_calls(expected_calls) @@ -1795,17 +1802,20 @@ class TestCephadm(object): cephadm_module.cert_key_store._init_known_cert_key_dicts() grafana_host1_key = 'fake-grafana-host1-key' + grafana_host2_key = 'fake-grafana-host2-key' nvmeof_client_key = 'nvmeof-client-key' nvmeof_server_key = 'nvmeof-server-key' nvmeof_encryption_key = 'nvmeof-encryption-key' - grafana_host1_key = 'fake-grafana-host1-cert' cephadm_module.cert_key_store.save_key('grafana_key', grafana_host1_key, host='host1') + cephadm_module.cert_key_store.save_key('grafana_key', grafana_host2_key, host='host2') cephadm_module.cert_key_store.save_key('nvmeof_client_key', nvmeof_client_key, service_name='nvmeof.foo') cephadm_module.cert_key_store.save_key('nvmeof_server_key', nvmeof_server_key, service_name='nvmeof.foo') cephadm_module.cert_key_store.save_key('nvmeof_encryption_key', nvmeof_encryption_key, service_name='nvmeof.foo') expected_calls = [ mock.call(f'{CERT_STORE_KEY_PREFIX}grafana_key', json.dumps({'host1': PrivKey(grafana_host1_key).to_json()})), + mock.call(f'{CERT_STORE_KEY_PREFIX}grafana_key', json.dumps({'host1': PrivKey(grafana_host1_key).to_json(), + 'host2': PrivKey(grafana_host2_key).to_json()})), mock.call(f'{CERT_STORE_KEY_PREFIX}nvmeof_client_key', json.dumps({'nvmeof.foo': PrivKey(nvmeof_client_key).to_json()})), mock.call(f'{CERT_STORE_KEY_PREFIX}nvmeof_server_key', json.dumps({'nvmeof.foo': PrivKey(nvmeof_server_key).to_json()})), mock.call(f'{CERT_STORE_KEY_PREFIX}nvmeof_encryption_key', json.dumps({'nvmeof.foo': PrivKey(nvmeof_encryption_key).to_json()})), diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 0d89657ac8c..d872219df80 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -581,7 +581,14 @@ class TestMonitoring: mock_getfqdn.return_value = purl.hostname with with_host(cephadm_module, "test"): - with with_service(cephadm_module, AlertManagerSpec()): + cephadm_module.cache.update_host_networks('test', { + '1.2.3.0/24': { + 'if0': ['1.2.3.1'] + }, + }) + with with_service(cephadm_module, AlertManagerSpec('alertmanager', + networks=['1.2.3.0/24'], + only_bind_port_on_networks=True)): y = dedent(self._get_config(expected_yaml_url)).lstrip() _run_cephadm.assert_called_with( 'test', @@ -595,11 +602,12 @@ class TestMonitoring: "deploy_arguments": [], "params": { 'tcp_ports': [9093, 9094], + 'port_ips': {"9094": "1.2.3.1"}, }, "meta": { 'service_name': 'alertmanager', 'ports': [9093, 9094], - 'ip': None, + 'ip': '1.2.3.1', 'deployed_by': [], 'rank': None, 'rank_generation': None, @@ -612,6 +620,7 @@ class TestMonitoring: }, "peers": [], "use_url_prefix": False, + "ip_to_bind_to": "1.2.3.1", } }), error_ok=True, @@ -634,8 +643,16 @@ class TestMonitoring: cephadm_module.secure_monitoring_stack = True cephadm_module.set_store(AlertmanagerService.USER_CFG_KEY, 'alertmanager_user') cephadm_module.set_store(AlertmanagerService.PASS_CFG_KEY, 'alertmanager_plain_password') + + cephadm_module.cache.update_host_networks('test', { + 'fd12:3456:789a::/64': { + 'if0': ['fd12:3456:789a::10'] + }, + }) with with_service(cephadm_module, MgmtGatewaySpec("mgmt-gateway")) as _, \ - with_service(cephadm_module, AlertManagerSpec()): + with_service(cephadm_module, AlertManagerSpec('alertmanager', + networks=['fd12:3456:789a::/64'], + only_bind_port_on_networks=True)): y = dedent(""" # This file is generated by cephadm. @@ -646,6 +663,8 @@ class TestMonitoring: http_config: tls_config: ca_file: root_cert.pem + cert_file: alertmanager.crt + key_file: alertmanager.key route: receiver: 'default' @@ -686,11 +705,12 @@ class TestMonitoring: "deploy_arguments": [], "params": { 'tcp_ports': [9093, 9094], + 'port_ips': {"9094": "fd12:3456:789a::10"} }, "meta": { 'service_name': 'alertmanager', 'ports': [9093, 9094], - 'ip': None, + 'ip': 'fd12:3456:789a::10', 'deployed_by': [], 'rank': None, 'rank_generation': None, @@ -708,6 +728,7 @@ class TestMonitoring: 'peers': [], 'web_config': '/etc/alertmanager/web.yml', "use_url_prefix": True, + "ip_to_bind_to": "fd12:3456:789a::10", } }), error_ok=True, @@ -741,6 +762,8 @@ class TestMonitoring: http_config: tls_config: ca_file: root_cert.pem + cert_file: alertmanager.crt + key_file: alertmanager.key route: receiver: 'default' @@ -801,6 +824,7 @@ class TestMonitoring: 'peers': [], 'web_config': '/etc/alertmanager/web.yml', "use_url_prefix": False, + "ip_to_bind_to": "", } }), error_ok=True, @@ -1170,6 +1194,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key scrape_configs: - job_name: 'ceph' @@ -1191,6 +1217,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'node' relabel_configs: @@ -1209,6 +1237,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'haproxy' relabel_configs: @@ -1225,6 +1255,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'ceph-exporter' relabel_configs: @@ -1242,6 +1274,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'nvmeof' honor_labels: true @@ -1255,6 +1289,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'nfs' honor_labels: true @@ -1268,6 +1304,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key - job_name: 'smb' honor_labels: true @@ -1281,6 +1319,8 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + cert_file: prometheus.crt + key_file: prometheus.key """).lstrip() @@ -2071,6 +2111,26 @@ class TestRGWService: }) assert f == expected + @pytest.mark.parametrize( + "disable_sync_traffic", + [ + (True), + (False), + ] + ) + @patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) + def test_rgw_disable_sync_traffic(self, disable_sync_traffic, cephadm_module: CephadmOrchestrator): + with with_host(cephadm_module, 'host1'): + s = RGWSpec(service_id="foo", + disable_multisite_sync_traffic=disable_sync_traffic) + with with_service(cephadm_module, s) as dds: + _, f, _ = cephadm_module.check_mon_command({ + 'prefix': 'config get', + 'who': f'client.{dds[0]}', + 'key': 'rgw_run_sync_thread', + }) + assert f == ('false' if disable_sync_traffic else 'true') + class TestMonService: @@ -3874,6 +3934,7 @@ class TestMgmtGateway: http { #access_log /dev/stdout; + error_log /dev/stderr info; client_header_buffer_size 32K; large_client_header_buffers 4 32k; proxy_busy_buffers_size 512k; @@ -4121,6 +4182,7 @@ class TestMgmtGateway: http { #access_log /dev/stdout; + error_log /dev/stderr info; client_header_buffer_size 32K; large_client_header_buffers 4 32k; proxy_busy_buffers_size 512k; diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index a505801eea5..4fbc975ae9f 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -747,6 +747,10 @@ class Orchestrator(object): """ raise NotImplementedError() + def set_osd_spec(self, service_name: str, osd_ids: List[str]) -> OrchResult: + """ set service of osd """ + raise NotImplementedError() + def blink_device_light(self, ident_fault: str, on: bool, locations: List['DeviceLightLoc']) -> OrchResult[List[str]]: """ Instructs the orchestrator to enable or disable either the ident or the fault LED. diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index 332bc75d862..d5a1bb3da2b 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -1472,6 +1472,14 @@ Usage: return HandleCommandResult(stdout=out) + @_cli_write_command('orch osd set-spec-affinity') + def _osd_set_spec(self, service_name: str, osd_id: List[str]) -> HandleCommandResult: + """Set service spec affinity for osd""" + completion = self.set_osd_spec(service_name, osd_id) + res = raise_if_exception(completion) + + return HandleCommandResult(stdout=res) + @_cli_write_command('orch daemon add') def daemon_add_misc(self, daemon_type: Optional[ServiceType] = None, @@ -1666,7 +1674,13 @@ Usage: specs: List[Union[ServiceSpec, HostSpec]] = [] # YAML '---' document separator with no content generates # None entries in the output. Let's skip them silently. - content = [o for o in yaml_objs if o is not None] + try: + content = [o for o in yaml_objs if o is not None] + except yaml.scanner.ScannerError as e: + msg = f"Invalid YAML received : {str(e)}" + self.log.exception(msg) + return HandleCommandResult(-errno.EINVAL, stderr=msg) + for s in content: try: spec = json_to_generic_spec(s) @@ -2191,7 +2205,13 @@ Usage: specs: List[TunedProfileSpec] = [] # YAML '---' document separator with no content generates # None entries in the output. Let's skip them silently. - content = [o for o in yaml_objs if o is not None] + try: + content = [o for o in yaml_objs if o is not None] + except yaml.scanner.ScannerError as e: + msg = f"Invalid YAML received : {str(e)}" + self.log.exception(msg) + return HandleCommandResult(-errno.EINVAL, stderr=msg) + for spec in content: specs.append(TunedProfileSpec.from_json(spec)) else: diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 8a2a38b86ee..1ac9fa49e32 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -1231,6 +1231,7 @@ class RGWSpec(ServiceSpec): rgw_bucket_counters_cache: Optional[bool] = False, rgw_bucket_counters_cache_size: Optional[int] = None, generate_cert: bool = False, + disable_multisite_sync_traffic: Optional[bool] = None, ): assert service_type == 'rgw', service_type @@ -1283,6 +1284,8 @@ class RGWSpec(ServiceSpec): self.rgw_bucket_counters_cache_size = rgw_bucket_counters_cache_size #: Whether we should generate a cert/key for the user if not provided self.generate_cert = generate_cert + #: Used to make RGW not do multisite replication so it can dedicate to IO + self.disable_multisite_sync_traffic = disable_multisite_sync_traffic def get_port_start(self) -> List[int]: return [self.get_port()] @@ -2328,6 +2331,7 @@ class AlertManagerSpec(MonitoringSpec): user_data: Optional[Dict[str, Any]] = None, config: Optional[Dict[str, str]] = None, networks: Optional[List[str]] = None, + only_bind_port_on_networks: bool = False, port: Optional[int] = None, secure: bool = False, extra_container_args: Optional[GeneralArgList] = None, @@ -2358,6 +2362,7 @@ class AlertManagerSpec(MonitoringSpec): # <webhook_configs> configuration. self.user_data = user_data or {} self.secure = secure + self.only_bind_port_on_networks = only_bind_port_on_networks def get_port_start(self) -> List[int]: return [self.get_port(), 9094] @@ -2404,7 +2409,7 @@ class GrafanaSpec(MonitoringSpec): self.protocol = protocol # whether ports daemons for this service bind to should - # bind to only hte networks listed in networks param, or + # bind to only the networks listed in networks param, or # to all networks. Defaults to false which is saying to bind # on all networks. self.only_bind_port_on_networks = only_bind_port_on_networks diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt index 3727c525ce7..41e473e23f0 100644 --- a/src/rgw/CMakeLists.txt +++ b/src/rgw/CMakeLists.txt @@ -487,9 +487,9 @@ target_link_libraries(radosgw PRIVATE install(TARGETS radosgw DESTINATION bin) set(radosgw_admin_srcs - rgw_admin.cc - rgw_sync_checkpoint.cc - rgw_orphan.cc) + radosgw-admin/radosgw-admin.cc + radosgw-admin/sync_checkpoint.cc + radosgw-admin/orphan.cc) # this is unsatisfying and hopefully temporary; ARROW should not be # part of radosgw_admin diff --git a/src/rgw/driver/daos/rgw_sal_daos.cc b/src/rgw/driver/daos/rgw_sal_daos.cc index a87d88c4b85..92dd7afe2fb 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.cc +++ b/src/rgw/driver/daos/rgw_sal_daos.cc @@ -858,8 +858,6 @@ bool DaosZone::is_writeable() { return true; } bool DaosZone::get_redirect_endpoint(std::string* endpoint) { return false; } -bool DaosZone::has_zonegroup_api(const std::string& api) const { return false; } - const std::string& DaosZone::get_current_period_id() { return current_period->get_id(); } diff --git a/src/rgw/driver/daos/rgw_sal_daos.h b/src/rgw/driver/daos/rgw_sal_daos.h index e382fdb04ae..5515579a441 100644 --- a/src/rgw/driver/daos/rgw_sal_daos.h +++ b/src/rgw/driver/daos/rgw_sal_daos.h @@ -484,7 +484,6 @@ class DaosZone : public StoreZone { virtual const std::string& get_name() const override; virtual bool is_writeable() override; virtual bool get_redirect_endpoint(std::string* endpoint) override; - virtual bool has_zonegroup_api(const std::string& api) const override; virtual const std::string& get_current_period_id() override; virtual const RGWAccessKey& get_system_key() { return zone_params->system_key; diff --git a/src/rgw/driver/motr/rgw_sal_motr.cc b/src/rgw/driver/motr/rgw_sal_motr.cc index b999673ac18..463ea8c5b11 100644 --- a/src/rgw/driver/motr/rgw_sal_motr.cc +++ b/src/rgw/driver/motr/rgw_sal_motr.cc @@ -1111,11 +1111,6 @@ bool MotrZone::get_redirect_endpoint(std::string* endpoint) return false; } -bool MotrZone::has_zonegroup_api(const std::string& api) const -{ - return (zonegroup.group.api_name == api); -} - const std::string& MotrZone::get_current_period_id() { return current_period->get_id(); diff --git a/src/rgw/driver/motr/rgw_sal_motr.h b/src/rgw/driver/motr/rgw_sal_motr.h index f92074b9d94..0f99ae48e86 100644 --- a/src/rgw/driver/motr/rgw_sal_motr.h +++ b/src/rgw/driver/motr/rgw_sal_motr.h @@ -525,7 +525,6 @@ class MotrZone : public StoreZone { virtual const std::string& get_name() const override; virtual bool is_writeable() override; virtual bool get_redirect_endpoint(std::string* endpoint) override; - virtual bool has_zonegroup_api(const std::string& api) const override; virtual const std::string& get_current_period_id() override; virtual const RGWAccessKey& get_system_key() { return zone_params->system_key; } virtual const std::string& get_realm_name() { return realm->get_name(); } diff --git a/src/rgw/driver/posix/rgw_sal_posix.cc b/src/rgw/driver/posix/rgw_sal_posix.cc index 1345468210f..9d76462baa0 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.cc +++ b/src/rgw/driver/posix/rgw_sal_posix.cc @@ -2893,6 +2893,14 @@ int POSIXObject::copy_object(const ACLOwner& owner, return dobj->set_obj_attrs(dpp, &attrs, nullptr, y, rgw::sal::FLAG_LOG_OP); } +int POSIXObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) +{ + return -EOPNOTSUPP; +} + int POSIXObject::load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh) { int ret = stat(dpp); diff --git a/src/rgw/driver/posix/rgw_sal_posix.h b/src/rgw/driver/posix/rgw_sal_posix.h index 8ec72bbc1bc..bf3478ad6ab 100644 --- a/src/rgw/driver/posix/rgw_sal_posix.h +++ b/src/rgw/driver/posix/rgw_sal_posix.h @@ -653,6 +653,13 @@ public: const DoutPrefixProvider* dpp, optional_yield y) override; virtual RGWAccessControlPolicy& get_acl(void) override { return acls; } virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; } + + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) override; + virtual int load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh = true) override; virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y, uint32_t flags) override; diff --git a/src/rgw/driver/rados/rgw_period.cc b/src/rgw/driver/rados/rgw_period.cc index f18e8e46bc5..aacb9b6a09a 100644 --- a/src/rgw/driver/rados/rgw_period.cc +++ b/src/rgw/driver/rados/rgw_period.cc @@ -68,20 +68,6 @@ int RGWPeriod::delete_obj(const DoutPrefixProvider *dpp, optional_yield y) return ret; } -int RGWPeriod::add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y) -{ - if (zonegroup.realm_id != realm_id) { - return 0; - } - int ret = period_map.update(zonegroup, cct); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl; - return ret; - } - - return store_info(dpp, false, y); -} - int RGWPeriod::update(const DoutPrefixProvider *dpp, optional_yield y) { auto zone_svc = sysobj_svc->get_zone_svc(); diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc index 0b77bca1da7..69075c506f1 100644 --- a/src/rgw/driver/rados/rgw_rados.cc +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -6962,13 +6962,13 @@ int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* octx, RGWBu } return 0; -} +} /* RGWRados::set_attrs() */ -static int get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y, - RGWRados* store, RGWBucketInfo& bucket_info, - RGWObjectCtx* rctx, RGWObjManifest* manifest, - int part_num, int* parts_count, bool prefetch, - RGWObjState** pstate, RGWObjManifest** pmanifest) +int RGWRados::get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y, + RGWRados* store, RGWBucketInfo& bucket_info, + RGWObjectCtx* rctx, RGWObjManifest* manifest, + int part_num, int* parts_count, bool prefetch, + RGWObjState** pstate, RGWObjManifest** pmanifest) { if (!manifest) { return -ERR_INVALID_PART; @@ -7047,6 +7047,9 @@ static int get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y, // update the object size sm->state.size = part_manifest.get_obj_size(); + if (!sm->state.attrset.count(RGW_ATTR_COMPRESSION)) { + sm->state.accounted_size = sm->state.size; + } *pmanifest = &part_manifest; return 0; diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h index b24823b60dc..fe79916392f 100644 --- a/src/rgw/driver/rados/rgw_rados.h +++ b/src/rgw/driver/rados/rgw_rados.h @@ -1071,6 +1071,12 @@ public: }; // class RGWRados::Bucket::List }; // class RGWRados::Bucket + static int get_part_obj_state(const DoutPrefixProvider* dpp, optional_yield y, + RGWRados* store, RGWBucketInfo& bucket_info, + RGWObjectCtx* rctx, RGWObjManifest* manifest, + int part_num, int* parts_count, bool prefetch, + RGWObjState** pstate, RGWObjManifest** pmanifest); + int on_last_entry_in_listing(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const std::string& obj_prefix, diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc index 88da446c3de..4c67d0ee71a 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.cc +++ b/src/rgw/driver/rados/rgw_sal_rados.cc @@ -2471,7 +2471,108 @@ bool RadosObject::is_sync_completed(const DoutPrefixProvider* dpp, const rgw_bi_log_entry& earliest_marker = entries.front(); return earliest_marker.timestamp > obj_mtime; -} +} /* is_sync_completed */ + +int RadosObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) +{ + int ret{0}; + + /* require an object with a manifest, so call to get_obj_state() must precede this */ + if (! manifest) { + return -EINVAL; + } + + RGWObjManifest::obj_iterator end = manifest->obj_end(dpp); + if (end.get_cur_part_id() == 0) { // not multipart + ldpp_dout(dpp, 20) << __func__ << " object does not have a multipart manifest" + << dendl; + return 0; + } + + auto end_part_id = end.get_cur_part_id(); + auto parts_count = (end_part_id == 1) ? 1 : end_part_id - 1; + if (marker > (parts_count - 1)) { + return 0; + } + + RGWObjManifest::obj_iterator part_iter = manifest->obj_begin(dpp); + + if (marker != 0) { + ldpp_dout_fmt(dpp, 20, + "{} seeking to part #{} in the object manifest", + __func__, marker); + + part_iter = manifest->obj_find_part(dpp, marker + 1); + + if (part_iter == end) { + ldpp_dout_fmt(dpp, 5, + "{} failed to find part #{} in the object manifest", + __func__, marker + 1); + return 0; + } + } + + RGWObjectCtx& obj_ctx = get_ctx(); + RGWBucketInfo& bucket_info = get_bucket()->get_info(); + + Object::Part obj_part{}; + for (; part_iter != manifest->obj_end(dpp); ++part_iter) { + + /* we're only interested in the first object in each logical part */ + auto cur_part_id = part_iter.get_cur_part_id(); + if (cur_part_id == obj_part.part_number) { + continue; + } + + if (max_parts < 1) { + *truncated = true; + break; + } + + /* get_part_obj_state alters the passed manifest** to point to a part + * manifest, which we don't want to leak out here */ + RGWObjManifest* obj_m = manifest; + RGWObjState* astate; + bool part_prefetch = false; + ret = RGWRados::get_part_obj_state(dpp, y, store->getRados(), bucket_info, &obj_ctx, + obj_m, cur_part_id, &parts_count, + part_prefetch, &astate, &obj_m); + + if (ret < 0) { + ldpp_dout_fmt(dpp, 4, + "{} get_part_obj_state() failed ret={}", + __func__, ret); + break; + } + + obj_part.part_number = part_iter.get_cur_part_id(); + obj_part.part_size = astate->accounted_size; + + if (auto iter = astate->attrset.find(RGW_ATTR_CKSUM); + iter != astate->attrset.end()) { + try { + rgw::cksum::Cksum part_cksum; + auto ck_iter = iter->second.cbegin(); + part_cksum.decode(ck_iter); + obj_part.cksum = std::move(part_cksum); + } catch (buffer::error& err) { + ldpp_dout_fmt(dpp, 4, + "WARN: {} could not decode stored cksum, " + "caught buffer::error", + __func__); + } + } + + each_func(obj_part); + *next_marker = ++marker; + --max_parts; + } /* each part */ + + return ret; +} /* RadosObject::list_parts */ int RadosObject::load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh) { @@ -4500,11 +4601,6 @@ bool RadosZone::get_redirect_endpoint(std::string* endpoint) return true; } -bool RadosZone::has_zonegroup_api(const std::string& api) const -{ - return store->svc()->zone->has_zonegroup_api(api); -} - const std::string& RadosZone::get_current_period_id() { return store->svc()->zone->get_current_period_id(); diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h index 23d81a934b0..85ea247e345 100644 --- a/src/rgw/driver/rados/rgw_sal_rados.h +++ b/src/rgw/driver/rados/rgw_sal_rados.h @@ -107,7 +107,6 @@ class RadosZone : public StoreZone { virtual const std::string& get_name() const override; virtual bool is_writeable() override; virtual bool get_redirect_endpoint(std::string* endpoint) override; - virtual bool has_zonegroup_api(const std::string& api) const override; virtual const std::string& get_current_period_id() override; virtual const RGWAccessKey& get_system_key() override; virtual const std::string& get_realm_name() override; @@ -593,12 +592,18 @@ class RadosObject : public StoreObject { StoreObject::set_compressed(); } - virtual bool is_sync_completed(const DoutPrefixProvider* dpp, const ceph::real_time& obj_mtime) override; /* For rgw_admin.cc */ RGWObjState& get_state() { return state; } virtual int load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh = true) override; + + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) override; + virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y, uint32_t flags) override; virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override; virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override; diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc index 79d2be0bcfa..bf7a309e864 100644 --- a/src/rgw/driver/rados/rgw_tools.cc +++ b/src/rgw/driver/rados/rgw_tools.cc @@ -339,21 +339,35 @@ int rgw_list_pool(const DoutPrefixProvider *dpp, ldpp_dout(dpp, 10) << "failed to parse cursor: " << marker << dendl; return -EINVAL; } - - auto iter = ioctx.nobjects_begin(oc); + librados::NObjectIterator iter; + try { + iter = ioctx.nobjects_begin(oc); + } catch (const std::system_error& e) { + ldpp_dout(dpp, 1) << "rgw_list_pool: Failed to begin iteration of pool " + << ioctx.get_pool_name() << " with error " + << e.what() << dendl; + return ceph::from_error_code(e.code()); + } /// Pool_iterate if (iter == ioctx.nobjects_end()) return -ENOENT; - for (; oids->size() < max && iter != ioctx.nobjects_end(); ++iter) { - string oid = iter->get_oid(); - ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl; + try { + for (; oids->size() < max && iter != ioctx.nobjects_end(); ++iter) { + string oid = iter->get_oid(); + ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl; - // fill it in with initial values; we may correct later - if (filter && !filter(oid, oid)) - continue; + // fill it in with initial values; we may correct later + if (filter && !filter(oid, oid)) + continue; - oids->push_back(oid); + oids->push_back(oid); + } + } catch (const std::system_error& e) { + ldpp_dout(dpp, 1) << "rgw_list_pool: Failed iterating pool " + << ioctx.get_pool_name() << " with error " + << e.what() << dendl; + return ceph::from_error_code(e.code()); } marker = iter.get_cursor().to_str(); diff --git a/src/rgw/driver/rados/rgw_user.h b/src/rgw/driver/rados/rgw_user.h index ab157f38e39..4ae7d13eff7 100644 --- a/src/rgw/driver/rados/rgw_user.h +++ b/src/rgw/driver/rados/rgw_user.h @@ -19,11 +19,11 @@ #define RGW_USER_ANON_ID "anonymous" -#define SECRET_KEY_LEN 40 -#define PUBLIC_ID_LEN 20 -#define RAND_SUBUSER_LEN 5 +constexpr auto SECRET_KEY_LEN=40; +constexpr auto PUBLIC_ID_LEN=20; +constexpr auto RAND_SUBUSER_LEN=5; -#define XMLNS_AWS_S3 "http://s3.amazonaws.com/doc/2006-03-01/" +constexpr auto XMLNS_AWS_S3 = "http://s3.amazonaws.com/doc/2006-03-01/"; class RGWUserCtl; class RGWBucketCtl; diff --git a/src/rgw/driver/rados/rgw_zone.h b/src/rgw/driver/rados/rgw_zone.h index c542abc76d6..5fb2b4b8096 100644 --- a/src/rgw/driver/rados/rgw_zone.h +++ b/src/rgw/driver/rados/rgw_zone.h @@ -769,7 +769,6 @@ public: int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true); int delete_obj(const DoutPrefixProvider *dpp, optional_yield y); int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y); - int add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y); void fork(); int update(const DoutPrefixProvider *dpp, optional_yield y); diff --git a/src/rgw/rgw_orphan.cc b/src/rgw/radosgw-admin/orphan.cc index b7dc562c721..9fca3b99a7c 100644 --- a/src/rgw/rgw_orphan.cc +++ b/src/rgw/radosgw-admin/orphan.cc @@ -1,6 +1,12 @@ + +/* + * Copyright (C) 2024 IBM +*/ + // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp +#include "radosgw-admin/orphan.h" #include <string> @@ -10,7 +16,6 @@ #include "rgw_op.h" #include "rgw_multi.h" -#include "rgw_orphan.h" #include "rgw_zone.h" #include "rgw_bucket.h" #include "rgw_sal_rados.h" diff --git a/src/rgw/rgw_orphan.h b/src/rgw/radosgw-admin/orphan.h index db811d31d9a..db811d31d9a 100644 --- a/src/rgw/rgw_orphan.h +++ b/src/rgw/radosgw-admin/orphan.h diff --git a/src/rgw/rgw_admin.cc b/src/rgw/radosgw-admin/radosgw-admin.cc index f2c1e81b540..182e42b8e31 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/radosgw-admin/radosgw-admin.cc @@ -1,12 +1,15 @@ +/* + * Copyright (C) 2025 IBM +*/ + // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab ft=cpp -#include <errno.h> -#include <iostream> -#include <sstream> +#include <cerrno> #include <string> - -#include <boost/optional.hpp> +#include <sstream> +#include <optional> +#include <iostream> extern "C" { #include <liboath/oath.h> @@ -38,6 +41,9 @@ extern "C" { #include "include/utime.h" #include "include/str_list.h" +#include "radosgw-admin/orphan.h" +#include "radosgw-admin/sync_checkpoint.h" + #include "rgw_user.h" #include "rgw_otp.h" #include "rgw_rados.h" @@ -48,7 +54,6 @@ extern "C" { #include "rgw_log.h" #include "rgw_formats.h" #include "rgw_usage.h" -#include "rgw_orphan.h" #include "rgw_sync.h" #include "rgw_trim_bilog.h" #include "rgw_trim_datalog.h" @@ -62,7 +67,6 @@ extern "C" { #include "rgw_zone.h" #include "rgw_pubsub.h" #include "rgw_bucket_sync.h" -#include "rgw_sync_checkpoint.h" #include "rgw_lua.h" #include "rgw_sal.h" #include "rgw_sal_config.h" @@ -82,11 +86,6 @@ extern "C" { #define dout_context g_ceph_context -#define SECRET_KEY_LEN 40 -#define PUBLIC_ID_LEN 20 - -using namespace std; - static rgw::sal::Driver* driver = NULL; static constexpr auto dout_subsys = ceph_subsys_rgw; @@ -117,19 +116,13 @@ static const DoutPrefixProvider* dpp() { } \ } while (0) -static inline int posix_errortrans(int r) +using namespace std; + +inline int posix_errortrans(int r) { - switch(r) { - case ERR_NO_SUCH_BUCKET: - r = ENOENT; - break; - default: - break; - } - return r; + return ERR_NO_SUCH_BUCKET == r ? ENOENT : r; } - static const std::string LUA_CONTEXT_LIST("prerequest, postrequest, background, getdata, putdata"); void usage() @@ -1272,7 +1265,7 @@ static int read_input(const string& infile, bufferlist& bl) } } -#define READ_CHUNK 8196 + constexpr auto READ_CHUNK=8196; int r; int err; diff --git a/src/rgw/rgw_sync_checkpoint.cc b/src/rgw/radosgw-admin/sync_checkpoint.cc index 1172e79a48f..0303ed6c747 100644 --- a/src/rgw/rgw_sync_checkpoint.cc +++ b/src/rgw/radosgw-admin/sync_checkpoint.cc @@ -5,6 +5,7 @@ * Ceph - scalable distributed file system * * Copyright (C) 2020 Red Hat, Inc. + * Copyright (C) 2024 IBM * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -13,9 +14,12 @@ * */ +#include "radosgw-admin/sync_checkpoint.h" + #include <fmt/format.h> + #include "common/errno.h" -#include "rgw_sync_checkpoint.h" + #include "rgw_sal_rados.h" #include "rgw_bucket_sync.h" #include "rgw_data_sync.h" diff --git a/src/rgw/rgw_sync_checkpoint.h b/src/rgw/radosgw-admin/sync_checkpoint.h index 28df68d8860..28df68d8860 100644 --- a/src/rgw/rgw_sync_checkpoint.h +++ b/src/rgw/radosgw-admin/sync_checkpoint.h diff --git a/src/rgw/rgw_cksum_pipe.cc b/src/rgw/rgw_cksum_pipe.cc index e06957e2715..0bec8d341af 100644 --- a/src/rgw/rgw_cksum_pipe.cc +++ b/src/rgw/rgw_cksum_pipe.cc @@ -18,6 +18,7 @@ #include <string> #include <fmt/format.h> #include <boost/algorithm/string.hpp> +#include "rgw_cksum.h" #include "rgw_common.h" #include "common/dout.h" #include "rgw_client_io.h" @@ -34,7 +35,8 @@ namespace rgw::putobj { {} std::unique_ptr<RGWPutObj_Cksum> RGWPutObj_Cksum::Factory( - rgw::sal::DataProcessor* next, const RGWEnv& env) + rgw::sal::DataProcessor* next, const RGWEnv& env, + rgw::cksum::Type override_type) { /* look for matching headers */ auto algo_header = cksum_algorithm_hdr(env); @@ -49,6 +51,13 @@ namespace rgw::putobj { throw rgw::io::Exception(EINVAL, std::system_category()); } /* no checksum header */ + if (override_type != rgw::cksum::Type::none) { + /* XXXX safe? do we need to fixup env as well? */ + auto algo_header = cksum_algorithm_hdr(override_type); + return + std::make_unique<RGWPutObj_Cksum>( + next, override_type, std::move(algo_header)); + } return std::unique_ptr<RGWPutObj_Cksum>(); } diff --git a/src/rgw/rgw_cksum_pipe.h b/src/rgw/rgw_cksum_pipe.h index fddcd283c84..c459d156335 100644 --- a/src/rgw/rgw_cksum_pipe.h +++ b/src/rgw/rgw_cksum_pipe.h @@ -20,6 +20,7 @@ #include <tuple> #include <cstring> #include <boost/algorithm/string/case_conv.hpp> +#include "rgw_cksum.h" #include "rgw_cksum_digest.h" #include "rgw_common.h" #include "rgw_putobj.h" @@ -29,6 +30,38 @@ namespace rgw::putobj { namespace cksum = rgw::cksum; using cksum_hdr_t = std::pair<const char*, const char*>; + static inline const cksum_hdr_t cksum_algorithm_hdr(rgw::cksum::Type t) { + static constexpr std::string_view hdr = + "HTTP_X_AMZ_SDK_CHECKSUM_ALGORITHM"; + using rgw::cksum::Type; + switch (t) { + case Type::sha256: + return cksum_hdr_t(hdr.data(), "SHA256"); + break; + case Type::crc32: + return cksum_hdr_t(hdr.data(), "CRC32"); + break; + case Type::crc32c: + return cksum_hdr_t(hdr.data(), "CRC32C"); + break; + case Type::xxh3: + return cksum_hdr_t(hdr.data(), "XX3"); + break; + case Type::sha1: + return cksum_hdr_t(hdr.data(), "SHA1"); + break; + case Type::sha512: + return cksum_hdr_t(hdr.data(), "SHA512"); + break; + case Type::blake3: + return cksum_hdr_t(hdr.data(), "BLAKE3"); + break; + default: + break; + }; + return cksum_hdr_t(nullptr, nullptr);; + } + static inline const cksum_hdr_t cksum_algorithm_hdr(const RGWEnv& env) { /* If the individual checksum value you provide through x-amz-checksum-algorithm doesn't match the checksum algorithm @@ -102,7 +135,8 @@ namespace rgw::putobj { using VerifyResult = std::tuple<bool, const cksum::Cksum&>; static std::unique_ptr<RGWPutObj_Cksum> Factory( - rgw::sal::DataProcessor* next, const RGWEnv&); + rgw::sal::DataProcessor* next, const RGWEnv&, + rgw::cksum::Type override_type); RGWPutObj_Cksum(rgw::sal::DataProcessor* next, rgw::cksum::Type _type, cksum_hdr_t&& _hdr); diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc index 97c46f12bd0..6610538542c 100644 --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@ -63,6 +63,7 @@ rgw_http_errors rgw_http_s3_errors({ { ERR_INVALID_DIGEST, {400, "InvalidDigest" }}, { ERR_BAD_DIGEST, {400, "BadDigest" }}, { ERR_INVALID_LOCATION_CONSTRAINT, {400, "InvalidLocationConstraint" }}, + { ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION, {400, "IllegalLocationConstraintException" }}, { ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION, {400, "ZonegroupDefaultPlacementMisconfiguration" }}, { ERR_INVALID_BUCKET_NAME, {400, "InvalidBucketName" }}, { ERR_INVALID_OBJECT_NAME, {400, "InvalidObjectName" }}, diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index f0bd41494c3..d2917838f36 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -337,6 +337,7 @@ inline constexpr const char* RGW_REST_STS_XMLNS = #define ERR_PRESIGNED_URL_EXPIRED 2223 #define ERR_PRESIGNED_URL_DISABLED 2224 #define ERR_AUTHORIZATION 2225 // SNS 403 AuthorizationError +#define ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION 2226 #define ERR_BUSY_RESHARDING 2300 // also in cls_rgw_types.h, don't change! #define ERR_NO_SUCH_ENTITY 2301 diff --git a/src/rgw/rgw_iam_policy.cc b/src/rgw/rgw_iam_policy.cc index 2a5c9cd313e..ef6761d4222 100644 --- a/src/rgw/rgw_iam_policy.cc +++ b/src/rgw/rgw_iam_policy.cc @@ -94,6 +94,8 @@ static const actpair actpairs[] = { "s3:GetPublicAccessBlock", s3GetPublicAccessBlock }, { "s3:GetObjectAcl", s3GetObjectAcl }, { "s3:GetObject", s3GetObject }, + { "s3:GetObjectAttributes", s3GetObjectAttributes }, + { "s3:GetObjectVersionAttributes", s3GetObjectVersionAttributes }, { "s3:GetObjectTorrent", s3GetObjectTorrent }, { "s3:GetObjectVersionAcl", s3GetObjectVersionAcl }, { "s3:GetObjectVersion", s3GetObjectVersion }, @@ -1335,6 +1337,7 @@ const char* action_bit_string(uint64_t action) { case s3ListBucketVersions: return "s3:ListBucketVersions"; + case s3ListAllMyBuckets: return "s3:ListAllMyBuckets"; @@ -1479,6 +1482,12 @@ const char* action_bit_string(uint64_t action) { case s3BypassGovernanceRetention: return "s3:BypassGovernanceRetention"; + case s3GetObjectAttributes: + return "s3:GetObjectAttributes"; + + case s3GetObjectVersionAttributes: + return "s3:GetObjectVersionAttributes"; + case s3DescribeJob: return "s3:DescribeJob"; diff --git a/src/rgw/rgw_iam_policy.h b/src/rgw/rgw_iam_policy.h index 0476926143f..dd323ee4b9c 100644 --- a/src/rgw/rgw_iam_policy.h +++ b/src/rgw/rgw_iam_policy.h @@ -115,6 +115,8 @@ enum { s3GetBucketEncryption, s3PutBucketEncryption, s3DescribeJob, + s3GetObjectAttributes, + s3GetObjectVersionAttributes, s3All, s3objectlambdaGetObject, @@ -247,6 +249,8 @@ inline int op_to_perm(std::uint64_t op) { case s3GetObjectVersionTagging: case s3GetObjectRetention: case s3GetObjectLegalHold: + case s3GetObjectAttributes: + case s3GetObjectVersionAttributes: case s3ListAllMyBuckets: case s3ListBucket: case s3ListBucketMultipartUploads: diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 5ce0033de47..7b0ca3134a3 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -25,8 +25,10 @@ #include "common/ceph_json.h" #include "common/static_ptr.h" #include "common/perf_counters_key.h" +#include "rgw_cksum.h" #include "rgw_cksum_digest.h" #include "rgw_common.h" +#include "common/split.h" #include "rgw_tracer.h" #include "rgw_rados.h" @@ -3558,54 +3560,62 @@ void RGWCreateBucket::execute(optional_yield y) const rgw::SiteConfig& site = *s->penv.site; const std::optional<RGWPeriod>& period = site.get_period(); const RGWZoneGroup& my_zonegroup = site.get_zonegroup(); - - if (s->system_request) { - // allow system requests to override the target zonegroup. for forwarded - // requests, we'll create the bucket for the originating zonegroup - createparams.zonegroup_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "zonegroup"); - } - + const std::string rgwx_zonegroup = s->info.args.get(RGW_SYS_PARAM_PREFIX "zonegroup"); const RGWZoneGroup* bucket_zonegroup = &my_zonegroup; - if (createparams.zonegroup_id.empty()) { - // default to the local zonegroup - createparams.zonegroup_id = my_zonegroup.id; - } else if (period) { - auto z = period->period_map.zonegroups.find(createparams.zonegroup_id); - if (z == period->period_map.zonegroups.end()) { - ldpp_dout(this, 0) << "could not find zonegroup " - << createparams.zonegroup_id << " in current period" << dendl; - op_ret = -ENOENT; - return; - } - bucket_zonegroup = &z->second; - } else if (createparams.zonegroup_id != my_zonegroup.id) { - ldpp_dout(this, 0) << "zonegroup does not match current zonegroup " - << createparams.zonegroup_id << dendl; - op_ret = -ENOENT; - return; - } - // validate the LocationConstraint + // Validate LocationConstraint if it's provided and enforcement is strict if (!location_constraint.empty() && !relaxed_region_enforcement) { - // on the master zonegroup, allow any valid api_name. otherwise it has to - // match the bucket's zonegroup - if (period && my_zonegroup.is_master) { - if (!period->period_map.zonegroups_by_api.count(location_constraint)) { + if (period) { + auto location_iter = period->period_map.zonegroups_by_api.find(location_constraint); + if (location_iter == period->period_map.zonegroups_by_api.end()) { ldpp_dout(this, 0) << "location constraint (" << location_constraint << ") can't be found." << dendl; op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; - s->err.message = "The specified location-constraint is not valid"; + s->err.message = fmt::format("The {} location constraint is not valid.", + location_constraint); return; } - } else if (bucket_zonegroup->api_name != location_constraint) { + bucket_zonegroup = &location_iter->second; + } else if (location_constraint != my_zonegroup.api_name) { // if we don't have a period, we can only use the current zonegroup - so check if the location matches by api name here ldpp_dout(this, 0) << "location constraint (" << location_constraint - << ") doesn't match zonegroup (" << bucket_zonegroup->api_name - << ')' << dendl; - op_ret = -ERR_INVALID_LOCATION_CONSTRAINT; - s->err.message = "The specified location-constraint is not valid"; + << ") doesn't match zonegroup (" << my_zonegroup.api_name << ")" << dendl; + op_ret = -ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION; + s->err.message = fmt::format("The {} location constraint is incompatible " + "for the region specific endpoint this request was sent to.", + location_constraint); return; } } + // If it's a system request, use the provided zonegroup if available + else if (s->system_request && !rgwx_zonegroup.empty()) { + if (period) { + auto zonegroup_iter = period->period_map.zonegroups.find(rgwx_zonegroup); + if (zonegroup_iter == period->period_map.zonegroups.end()) { + ldpp_dout(this, 0) << "could not find zonegroup " << rgwx_zonegroup + << " in current period" << dendl; + op_ret = -ENOENT; + return; + } + bucket_zonegroup = &zonegroup_iter->second; + } + } + + const bool enforce_location_match = + !period || // No period: no multisite, so no need to enforce location match. + !s->system_request || // All user requests are enforced to match zonegroup's location. + !my_zonegroup.is_master; // but if it's a system request (forwarded) only allow remote creation on master zonegroup. + if (enforce_location_match && !my_zonegroup.equals(bucket_zonegroup->get_id())) { + ldpp_dout(this, 0) << "location constraint (" << bucket_zonegroup->api_name + << ") doesn't match zonegroup (" << my_zonegroup.api_name << ")" << dendl; + op_ret = -ERR_ILLEGAL_LOCATION_CONSTRAINT_EXCEPTION; + s->err.message = fmt::format("The {} location constraint is incompatible " + "for the region specific endpoint this request was sent to.", + bucket_zonegroup->api_name); + return; + } + + // Set the final zonegroup ID + createparams.zonegroup_id = bucket_zonegroup->id; // select and validate the placement target op_ret = select_bucket_placement(this, *bucket_zonegroup, s->user->get_info(), @@ -3614,7 +3624,7 @@ void RGWCreateBucket::execute(optional_yield y) return; } - if (bucket_zonegroup == &my_zonegroup) { + if (my_zonegroup.equals(bucket_zonegroup->get_id())) { // look up the zone placement pool createparams.zone_placement = rgw::find_zone_placement( this, site.get_zone_params(), createparams.placement_rule); @@ -3703,7 +3713,6 @@ void RGWCreateBucket::execute(optional_yield y) if (!driver->is_meta_master()) { // apply bucket creation on the master zone first - bufferlist in_data; JSONParser jp; op_ret = rgw_forward_request_to_master(this, *s->penv.site, s->owner.id, &in_data, &jp, s->info, y); @@ -3780,7 +3789,10 @@ void RGWCreateBucket::execute(optional_yield y) s->bucket->get_info().has_website = !s->bucket->get_info().website_conf.is_empty(); /* This will also set the quota on the bucket. */ - op_ret = s->bucket->merge_and_store_attrs(this, createparams.attrs, y); + s->bucket->set_attrs(std::move(createparams.attrs)); + constexpr bool exclusive = false; // overwrite + constexpr ceph::real_time no_set_mtime{}; + op_ret = s->bucket->put_info(this, exclusive, no_set_mtime, y); } while (op_ret == -ECANCELED && tries++ < 20); /* Restore the proper return code. */ @@ -4331,6 +4343,9 @@ void RGWPutObj::execute(optional_yield y) } return; } + + multipart_cksum_type = upload->cksum_type; + /* upload will go out of scope, so copy the dest placement for later use */ s->dest_placement = *pdest_placement; pdest_placement = &s->dest_placement; @@ -4461,11 +4476,12 @@ void RGWPutObj::execute(optional_yield y) /* optional streaming checksum */ try { cksum_filter = - rgw::putobj::RGWPutObj_Cksum::Factory(filter, *s->info.env); + rgw::putobj::RGWPutObj_Cksum::Factory(filter, *s->info.env, multipart_cksum_type); } catch (const rgw::io::Exception& e) { op_ret = -e.code().value(); return; } + if (cksum_filter) { filter = &*cksum_filter; } @@ -4612,10 +4628,12 @@ void RGWPutObj::execute(optional_yield y) if (cksum_filter) { const auto& hdr = cksum_filter->header(); + auto expected_ck = cksum_filter->expected(*s->info.env); auto cksum_verify = cksum_filter->verify(*s->info.env); // valid or no supplied cksum cksum = get<1>(cksum_verify); - if (std::get<0>(cksum_verify)) { + if ((!expected_ck) || + std::get<0>(cksum_verify)) { buffer::list cksum_bl; ldpp_dout_fmt(this, 16, @@ -4623,14 +4641,13 @@ void RGWPutObj::execute(optional_yield y) "\n\tcomputed={} == \n\texpected={}", hdr.second, cksum->to_armor(), - cksum_filter->expected(*s->info.env)); + (!!expected_ck) ? expected_ck : "(checksum unavailable)"); cksum->encode(cksum_bl); emplace_attr(RGW_ATTR_CKSUM, std::move(cksum_bl)); } else { /* content checksum mismatch */ auto computed_ck = cksum->to_armor(); - auto expected_ck = cksum_filter->expected(*s->info.env); ldpp_dout_fmt(this, 4, "{} content checksum mismatch" @@ -4833,7 +4850,8 @@ void RGWPostObj::execute(optional_yield y) /* optional streaming checksum */ try { cksum_filter = - rgw::putobj::RGWPutObj_Cksum::Factory(filter, *s->info.env); + rgw::putobj::RGWPutObj_Cksum::Factory( + filter, *s->info.env, rgw::cksum::Type::none /* no override */); } catch (const rgw::io::Exception& e) { op_ret = -e.code().value(); return; @@ -5181,7 +5199,10 @@ void RGWPutMetadataBucket::execute(optional_yield y) /* Setting attributes also stores the provided bucket info. Due * to this fact, the new quota settings can be serialized with * the same call. */ - op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield); + s->bucket->set_attrs(attrs); + constexpr bool exclusive = false; // overwrite + constexpr ceph::real_time no_set_mtime{}; + op_ret = s->bucket->put_info(this, exclusive, no_set_mtime, s->yield); return op_ret; }, y); } @@ -5969,8 +5990,6 @@ void RGWGetACLs::execute(optional_yield y) acls = ss.str(); } - - int RGWPutACLs::verify_permission(optional_yield y) { bool perm; @@ -5992,6 +6011,74 @@ int RGWPutACLs::verify_permission(optional_yield y) return 0; } +uint16_t RGWGetObjAttrs::recognize_attrs(const std::string& hdr, uint16_t deflt) +{ + auto attrs{deflt}; + auto sa = ceph::split(hdr, ","); + for (auto& k : sa) { + if (boost::iequals(k, "etag")) { + attrs |= as_flag(ReqAttributes::Etag); + } + if (boost::iequals(k, "checksum")) { + attrs |= as_flag(ReqAttributes::Checksum); + } + if (boost::iequals(k, "objectparts")) { + attrs |= as_flag(ReqAttributes::ObjectParts); + } + if (boost::iequals(k, "objectsize")) { + attrs |= as_flag(ReqAttributes::ObjectSize); + } + if (boost::iequals(k, "storageclass")) { + attrs |= as_flag(ReqAttributes::StorageClass); + } + } + return attrs; +} /* RGWGetObjAttrs::recognize_attrs */ + +int RGWGetObjAttrs::verify_permission(optional_yield y) +{ + bool perm = false; + auto [has_s3_existing_tag, has_s3_resource_tag] = + rgw_check_policy_condition(this, s); + + if (! rgw::sal::Object::empty(s->object.get())) { + + auto iam_action1 = s->object->get_instance().empty() ? + rgw::IAM::s3GetObject : + rgw::IAM::s3GetObjectVersion; + + auto iam_action2 = s->object->get_instance().empty() ? + rgw::IAM::s3GetObjectAttributes : + rgw::IAM::s3GetObjectVersionAttributes; + + if (has_s3_existing_tag || has_s3_resource_tag) { + rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag); + } + + /* XXXX the following conjunction should be &&--but iam_action2 is currently not + * hooked up and always fails (but should succeed if the requestor has READ + * acess to the object) */ + perm = (verify_object_permission(this, s, iam_action1) || /* && */ + verify_object_permission(this, s, iam_action2)); + } + + if (! perm) { + return -EACCES; + } + + return 0; +} + +void RGWGetObjAttrs::pre_exec() +{ + rgw_bucket_object_pre_exec(s); +} + +void RGWGetObjAttrs::execute(optional_yield y) +{ + RGWGetObj::execute(y); +} /* RGWGetObjAttrs::execute */ + int RGWGetLC::verify_permission(optional_yield y) { auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false); @@ -6659,6 +6746,14 @@ try_sum_part_cksums(const DoutPrefixProvider *dpp, ++parts_ix; auto& part_cksum = part.second->get_cksum(); + if (! part_cksum) { + ldpp_dout_fmt(dpp, 0, + "ERROR: multipart part checksum not present (ix=={})", + parts_ix); + op_ret = -ERR_INVALID_REQUEST; + return op_ret; + } + ldpp_dout_fmt(dpp, 16, "INFO: {} iterate part: {} {} {}", __func__, parts_ix, part_cksum->type_string(), diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 9f747501729..dcf64c31572 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -12,6 +12,7 @@ #pragma once +#include <cstdint> #include <limits.h> #include <array> @@ -1111,6 +1112,7 @@ class RGWCreateBucket : public RGWOp { bool relaxed_region_enforcement = false; RGWCORSConfiguration cors_config; std::set<std::string> rmattr_names; + bufferlist in_data; virtual bool need_metadata_upload() const { return false; } @@ -1237,6 +1239,7 @@ protected: std::string multipart_upload_id; std::string multipart_part_str; int multipart_part_num = 0; + rgw::cksum::Type multipart_cksum_type{rgw::cksum::Type::none}; jspan_ptr multipart_trace; boost::optional<ceph::real_time> delete_at; @@ -1644,6 +1647,50 @@ public: uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; } }; +class RGWGetObjAttrs : public RGWGetObj { +protected: + std::string version_id; + std::string expected_bucket_owner; + std::optional<int> marker; + std::optional<int> max_parts; + uint16_t requested_attributes{0}; +#if 0 + /* used to decrypt attributes for objects stored with SSE-C */ + x-amz-server-side-encryption-customer-algorithm + x-amz-server-side-encryption-customer-key + x-amz-server-side-encryption-customer-key-MD5 +#endif +public: + + enum class ReqAttributes : uint16_t { + None = 0, + Etag, + Checksum, + ObjectParts, + StorageClass, + ObjectSize + }; + + static uint16_t as_flag(ReqAttributes attr) { + return 1 << (uint16_t(attr) ? uint16_t(attr) - 1 : 0); + } + + static uint16_t recognize_attrs(const std::string& hdr, uint16_t deflt = 0); + + RGWGetObjAttrs() : RGWGetObj() + { + RGWGetObj::get_data = false; // it's extra false + } + + int verify_permission(optional_yield y) override; + void pre_exec() override; + void execute(optional_yield y) override; + void send_response() override = 0; + const char* name() const override { return "get_obj_attrs"; } + RGWOpType get_type() override { return RGW_OP_GET_OBJ_ATTRS; } + uint32_t op_mask() override { return RGW_OP_TYPE_READ; } +}; /* RGWGetObjAttrs */ + class RGWGetLC : public RGWOp { protected: diff --git a/src/rgw/rgw_op_type.h b/src/rgw/rgw_op_type.h index 49faea6403d..2c8225d289e 100644 --- a/src/rgw/rgw_op_type.h +++ b/src/rgw/rgw_op_type.h @@ -30,6 +30,7 @@ enum RGWOpType { RGW_OP_COPY_OBJ, RGW_OP_GET_ACLS, RGW_OP_PUT_ACLS, + RGW_OP_GET_OBJ_ATTRS, RGW_OP_GET_CORS, RGW_OP_PUT_CORS, RGW_OP_DELETE_CORS, diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h index aa33080af56..9111696453e 100644 --- a/src/rgw/rgw_rest.h +++ b/src/rgw/rgw_rest.h @@ -403,6 +403,17 @@ public: virtual std::string canonical_name() const override { return fmt::format("REST.{}.ACL", s->info.method); } }; +class RGWGetObjAttrs_ObjStore : public RGWGetObjAttrs { +public: + RGWGetObjAttrs_ObjStore() {} + ~RGWGetObjAttrs_ObjStore() override {} + + int get_params(optional_yield y) = 0; + /* not actually used */ + int send_response_data_error(optional_yield y) override { return 0; }; + int send_response_data(bufferlist& bl, off_t ofs, off_t len) override { return 0; }; +}; + class RGWGetLC_ObjStore : public RGWGetLC { public: RGWGetLC_ObjStore() {} diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index 30ebe8e8965..9edb79d8fd0 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -9,6 +9,7 @@ #include <string_view> #include "common/ceph_crypto.h" +#include "common/dout.h" #include "common/split.h" #include "common/Formatter.h" #include "common/utf8.h" @@ -807,7 +808,6 @@ void RGWGetObjTags_ObjStore_S3::send_response_data(bufferlist& bl) } } - int RGWPutObjTags_ObjStore_S3::get_params(optional_yield y) { RGWXMLParser parser; @@ -2533,6 +2533,10 @@ int RGWCreateBucket_ObjStore_S3::get_params(optional_yield y) if ((op_ret < 0) && (op_ret != -ERR_LENGTH_REQUIRED)) return op_ret; + if (!driver->is_meta_master()) { + in_data.append(data); + } + if (data.length()) { RGWCreateBucketParser parser; @@ -3815,6 +3819,196 @@ void RGWPutACLs_ObjStore_S3::send_response() dump_start(s); } +int RGWGetObjAttrs_ObjStore_S3::get_params(optional_yield y) +{ + string err; + auto& env = s->info.env; + version_id = s->info.args.get("versionId"); + + auto hdr = env->get_optional("HTTP_X_AMZ_EXPECTED_BUCKET_OWNER"); + if (hdr) { + expected_bucket_owner = *hdr; + } + + hdr = env->get_optional("HTTP_X_AMZ_MAX_PARTS"); + if (hdr) { + max_parts = strict_strtol(hdr->c_str(), 10, &err); + if (!err.empty()) { + s->err.message = "Invalid value for MaxParts: " + err; + ldpp_dout(s, 10) << "Invalid value for MaxParts " << *hdr << ": " + << err << dendl; + return -ERR_INVALID_PART; + } + max_parts = std::min(*max_parts, 1000); + } + + hdr = env->get_optional("HTTP_X_AMZ_PART_NUMBER_MARKER"); + if (hdr) { + marker = strict_strtol(hdr->c_str(), 10, &err); + if (!err.empty()) { + s->err.message = "Invalid value for PartNumberMarker: " + err; + ldpp_dout(s, 10) << "Invalid value for PartNumberMarker " << *hdr << ": " + << err << dendl; + return -ERR_INVALID_PART; + } + } + + hdr = env->get_optional("HTTP_X_AMZ_OBJECT_ATTRIBUTES"); + if (hdr) { + requested_attributes = recognize_attrs(*hdr); + } + + /* XXX skipping SSE-C params for now */ + + return 0; +} /* RGWGetObjAttrs_ObjStore_S3::get_params(...) */ + +int RGWGetObjAttrs_ObjStore_S3::get_decrypt_filter( + std::unique_ptr<RGWGetObj_Filter> *filter, + RGWGetObj_Filter* cb, bufferlist* manifest_bl) +{ + // we aren't actually decrypting the data, but for objects encrypted with + // SSE-C we do need to verify that required headers are present and valid + // + // in the SSE-KMS and SSE-S3 cases, this unfortunately causes us to fetch + // decryption keys which we don't need :( + std::unique_ptr<BlockCrypt> block_crypt; // ignored + std::map<std::string, std::string> crypt_http_responses; // ignored + return rgw_s3_prepare_decrypt(s, s->yield, attrs, &block_crypt, + crypt_http_responses); +} + +void RGWGetObjAttrs_ObjStore_S3::send_response() +{ + if (op_ret) + set_req_state_err(s, op_ret); + dump_errno(s); + + if (op_ret == 0) { + version_id = s->object->get_instance(); + + // x-amz-delete-marker: DeleteMarker // not sure we can plausibly do this? + dump_last_modified(s, lastmod); + dump_header_if_nonempty(s, "x-amz-version-id", version_id); + // x-amz-request-charged: RequestCharged + } + + end_header(s, this, to_mime_type(s->format)); + dump_start(s); + + if (op_ret == 0) { + s->formatter->open_object_section("GetObjectAttributes"); + if (requested_attributes & as_flag(ReqAttributes::Etag)) { + if (lo_etag.empty()) { + auto iter = attrs.find(RGW_ATTR_ETAG); + if (iter != attrs.end()) { + lo_etag = iter->second.to_str(); + } + } + s->formatter->dump_string("ETag", lo_etag); + } + + if (requested_attributes & as_flag(ReqAttributes::Checksum)) { + s->formatter->open_object_section("Checksum"); + auto iter = attrs.find(RGW_ATTR_CKSUM); + if (iter != attrs.end()) { + try { + rgw::cksum::Cksum cksum; + auto bliter = iter->second.cbegin(); + cksum.decode(bliter); + if (multipart_parts_count && multipart_parts_count > 0) { + s->formatter->dump_string(cksum.element_name(), + fmt::format("{}-{}", cksum.to_armor(), *multipart_parts_count)); + } else { + s->formatter->dump_string(cksum.element_name(), cksum.to_armor()); + } + } catch (buffer::error& err) { + ldpp_dout(this, 0) + << "ERROR: could not decode stored cksum, caught buffer::error" << dendl; + } + } + s->formatter->close_section(); /* Checksum */ + } /* Checksum */ + + if (requested_attributes & as_flag(ReqAttributes::ObjectParts)) { + if (multipart_parts_count && multipart_parts_count > 0) { + + /* XXX the following was needed to see a manifest at list_parts()! */ + op_ret = s->object->load_obj_state(s, s->yield); + if (op_ret < 0) { + ldpp_dout_fmt(this, 0, + "ERROR: {} load_obj_state() failed ret={}", __func__, + op_ret); + } + + ldpp_dout_fmt(this, 16, + "{} attr flags={} parts_count={}", + __func__, requested_attributes, *multipart_parts_count); + + s->formatter->open_object_section("ObjectParts"); + + bool truncated = false; + int next_marker; + + using namespace rgw::sal; + + int ret = + s->object->list_parts( + this, s->cct, + max_parts ? *max_parts : 1000, + marker ? *marker : 0, + &next_marker, &truncated, + [&](const Object::Part& part) -> int { + s->formatter->open_object_section("Part"); + s->formatter->dump_int("PartNumber", part.part_number); + s->formatter->dump_unsigned("Size", part.part_size); + if (part.cksum.type != rgw::cksum::Type::none) { + s->formatter->dump_string(part.cksum.element_name(), part.cksum.to_armor()); + } + s->formatter->close_section(); /* Part */ + return 0; + }, s->yield); + + if (ret < 0) { + ldpp_dout_fmt(this, 0, + "ERROR: {} list-parts failed for {}", + __func__, s->object->get_name()); + } + /* AWS docs disagree on the name of this element */ + s->formatter->dump_int("PartsCount", *multipart_parts_count); + s->formatter->dump_int("TotalPartsCount", *multipart_parts_count); + s->formatter->dump_bool("IsTruncated", truncated); + if (max_parts) { + s->formatter->dump_int("MaxParts", *max_parts); + } + if(truncated) { + s->formatter->dump_int("NextPartNumberMarker", next_marker); + } + if (marker) { + s->formatter->dump_int("PartNumberMarker", *marker); + } + s->formatter->close_section(); + } /* multipart_parts_count positive */ + } /* ObjectParts */ + + if (requested_attributes & as_flag(ReqAttributes::ObjectSize)) { + s->formatter->dump_int("ObjectSize", s->obj_size); + } + + if (requested_attributes & as_flag(ReqAttributes::StorageClass)) { + auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS); + if (iter != attrs.end()) { + s->formatter->dump_string("StorageClass", iter->second.to_str()); + } else { + s->formatter->dump_string("StorageClass", "STANDARD"); + } + } + s->formatter->close_section(); + } /* op_ret == 0 */ + + rgw_flush_formatter_and_reset(s, s->formatter); +} /* RGWGetObjAttrs_ObjStore_S3::send_response */ + void RGWGetLC_ObjStore_S3::execute(optional_yield y) { config.set_ctx(s->cct); @@ -4794,6 +4988,7 @@ RGWOp *RGWHandler_REST_Bucket_S3::get_obj_op(bool get_data) const RGWOp *RGWHandler_REST_Bucket_S3::op_get() { + /* XXX maybe we could replace this with an indexing operation */ if (s->info.args.sub_resource_exists("encryption")) return nullptr; @@ -4990,6 +5185,8 @@ RGWOp *RGWHandler_REST_Obj_S3::op_get() return new RGWGetObjLayout_ObjStore_S3; } else if (is_tagging_op()) { return new RGWGetObjTags_ObjStore_S3; + } else if (is_attributes_op()) { + return new RGWGetObjAttrs_ObjStore_S3; } else if (is_obj_retention_op()) { return new RGWGetObjRetention_ObjStore_S3; } else if (is_obj_legal_hold_op()) { diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h index 50160d79a42..e8fdc69751c 100644 --- a/src/rgw/rgw_rest_s3.h +++ b/src/rgw/rgw_rest_s3.h @@ -374,6 +374,18 @@ public: int get_params(optional_yield y) override; }; +class RGWGetObjAttrs_ObjStore_S3 : public RGWGetObjAttrs_ObjStore { +public: + RGWGetObjAttrs_ObjStore_S3() {} + ~RGWGetObjAttrs_ObjStore_S3() override {} + + int get_params(optional_yield y) override; + int get_decrypt_filter(std::unique_ptr<RGWGetObj_Filter>* filter, + RGWGetObj_Filter* cb, + bufferlist* manifest_bl) override; + void send_response() override; +}; + class RGWGetLC_ObjStore_S3 : public RGWGetLC_ObjStore { protected: RGWLifecycleConfiguration_S3 config; @@ -701,6 +713,9 @@ protected: bool is_acl_op() const { return s->info.args.exists("acl"); } + bool is_attributes_op() const { + return s->info.args.exists("attributes"); + } bool is_cors_op() const { return s->info.args.exists("cors"); } @@ -759,6 +774,9 @@ protected: bool is_acl_op() const { return s->info.args.exists("acl"); } + bool is_attributes_op() const { + return s->info.args.exists("attributes"); + } bool is_tagging_op() const { return s->info.args.exists("tagging"); } diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h index e098c4decf7..4b94f74b851 100644 --- a/src/rgw/rgw_sal.h +++ b/src/rgw/rgw_sal.h @@ -15,6 +15,7 @@ #pragma once +#include <cstdint> #include <optional> #include <boost/intrusive_ptr.hpp> #include <boost/smart_ptr/intrusive_ref_counter.hpp> @@ -26,6 +27,7 @@ #include "rgw_notify_event_type.h" #include "rgw_req_context.h" #include "include/random.h" +#include "include/function2.hpp" // FIXME: following subclass dependencies #include "driver/rados/rgw_user.h" @@ -1169,6 +1171,9 @@ class Object { std::string* version_id, std::string* tag, std::string* etag, void (*progress_cb)(off_t, void *), void* progress_data, const DoutPrefixProvider* dpp, optional_yield y) = 0; + + /** return logging subsystem */ + virtual unsigned get_subsys() { return ceph_subsys_rgw; }; /** Get the ACL for this object */ virtual RGWAccessControlPolicy& get_acl(void) = 0; /** Set the ACL for this object */ @@ -1249,6 +1254,28 @@ class Object { /** Dump driver-specific object layout info in JSON */ virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) = 0; + /* A transfer data type describing metadata specific to one part of a + * completed multipart upload object, following the GetObjectAttributes + * response syntax for Object::Parts here: + * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObjectAttributes.html */ + class Part + { + public: + int part_number; + uint32_t part_size; + rgw::cksum::Cksum cksum; + }; /* Part */ + + /* callback function/object used by list_parts */ + using list_parts_each_t = + const fu2::unique_function<int(const Part&) const>; + + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) = 0; + /** Get the cached attributes for this object */ virtual Attrs& get_attrs(void) = 0; /** Get the (const) cached attributes for this object */ @@ -1447,7 +1474,7 @@ public: virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) = 0; /** List all the parts of this upload, filling the parts cache */ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, - int num_parts, int marker, + int max_parts, int marker, int* next_marker, bool* truncated, optional_yield y, bool assume_unsorted = false) = 0; /** Abort this upload */ @@ -1751,8 +1778,6 @@ class Zone { virtual bool is_writeable() = 0; /** Get the URL for the endpoint for redirecting to this zone */ virtual bool get_redirect_endpoint(std::string* endpoint) = 0; - /** Check to see if the given API is supported in this zone */ - virtual bool has_zonegroup_api(const std::string& api) const = 0; /** Get the current period ID for this zone */ virtual const std::string& get_current_period_id() = 0; /** Get thes system access key for this zone */ diff --git a/src/rgw/rgw_sal_dbstore.cc b/src/rgw/rgw_sal_dbstore.cc index 0e4f95846d1..02fd7a49cda 100644 --- a/src/rgw/rgw_sal_dbstore.cc +++ b/src/rgw/rgw_sal_dbstore.cc @@ -458,14 +458,6 @@ namespace rgw::sal { return false; } - bool DBZone::has_zonegroup_api(const std::string& api) const - { - if (api == "default") - return true; - - return false; - } - const std::string& DBZone::get_current_period_id() { return current_period->get_id(); @@ -496,6 +488,14 @@ namespace rgw::sal { return std::make_unique<DBLuaManager>(this); } + int DBObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) + { + return -EOPNOTSUPP; + } + int DBObject::load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh) { RGWObjState* astate; diff --git a/src/rgw/rgw_sal_dbstore.h b/src/rgw/rgw_sal_dbstore.h index b54249df031..4df10d1dce1 100644 --- a/src/rgw/rgw_sal_dbstore.h +++ b/src/rgw/rgw_sal_dbstore.h @@ -303,7 +303,6 @@ protected: virtual const std::string& get_name() const override; virtual bool is_writeable() override; virtual bool get_redirect_endpoint(std::string* endpoint) override; - virtual bool has_zonegroup_api(const std::string& api) const override; virtual const std::string& get_current_period_id() override; virtual const RGWAccessKey& get_system_key() override; virtual const std::string& get_realm_name() override; @@ -529,6 +528,7 @@ protected: DBObject(DBObject& _o) = default; + virtual unsigned get_subsys() { return ceph_subsys_rgw_dbstore; }; virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y, uint32_t flags, @@ -554,6 +554,13 @@ protected: virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; } virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y, uint32_t flags) override; + + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) override; + virtual int load_obj_state(const DoutPrefixProvider* dpp, optional_yield y, bool follow_olh = true) override; virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override; virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override; diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc index 733bfa39ee2..15da580988e 100644 --- a/src/rgw/rgw_sal_filter.cc +++ b/src/rgw/rgw_sal_filter.cc @@ -1046,6 +1046,17 @@ RGWAccessControlPolicy& FilterObject::get_acl() return next->get_acl(); } +int FilterObject::list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) +{ + return next->list_parts(dpp, cct, max_parts, marker, next_marker, + truncated, + sal::Object::list_parts_each_t(each_func), + y); +} + int FilterObject::load_obj_state(const DoutPrefixProvider *dpp, optional_yield y, bool follow_olh) { return next->load_obj_state(dpp, y, follow_olh); diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h index 43a440e8b10..947ce9d4bf5 100644 --- a/src/rgw/rgw_sal_filter.h +++ b/src/rgw/rgw_sal_filter.h @@ -108,9 +108,6 @@ public: virtual bool get_redirect_endpoint(std::string* endpoint) override { return next->get_redirect_endpoint(endpoint); } - virtual bool has_zonegroup_api(const std::string& api) const override { - return next->has_zonegroup_api(api); - } virtual const std::string& get_current_period_id() override { return next->get_current_period_id(); } @@ -781,6 +778,12 @@ public: virtual bool empty() const override { return next->empty(); } virtual const std::string &get_name() const override { return next->get_name(); } + /** If multipart, enumerate (a range [marker..marker+[min(max_parts, parts_count-1)] of) parts of the object */ + virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct, + int max_parts, int marker, int* next_marker, + bool* truncated, list_parts_each_t each_func, + optional_yield y) override; + virtual int load_obj_state(const DoutPrefixProvider *dpp, optional_yield y, bool follow_olh = true) override; virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, diff --git a/src/rgw/services/svc_zone.cc b/src/rgw/services/svc_zone.cc index 61360135dd9..97d81550058 100644 --- a/src/rgw/services/svc_zone.cc +++ b/src/rgw/services/svc_zone.cc @@ -657,18 +657,6 @@ const string& RGWSI_Zone::get_current_period_id() const return current_period->get_id(); } -bool RGWSI_Zone::has_zonegroup_api(const std::string& api) const -{ - if (!current_period->get_id().empty()) { - const auto& zonegroups_by_api = current_period->get_map().zonegroups_by_api; - if (zonegroups_by_api.find(api) != zonegroups_by_api.end()) - return true; - } else if (zonegroup->api_name == api) { - return true; - } - return false; -} - bool RGWSI_Zone::zone_is_writeable() { return writeable_zone && !get_zone().is_read_only(); diff --git a/src/rgw/services/svc_zone.h b/src/rgw/services/svc_zone.h index 1b3f49587a3..719546eb8db 100644 --- a/src/rgw/services/svc_zone.h +++ b/src/rgw/services/svc_zone.h @@ -96,7 +96,6 @@ public: uint32_t get_zone_short_id() const; const std::string& get_current_period_id() const; - bool has_zonegroup_api(const std::string& api) const; bool zone_is_writeable(); bool zone_syncs_from(const RGWZone& target_zone, const RGWZone& source_zone) const; diff --git a/src/script/run-make.sh b/src/script/run-make.sh index 52d43d3a171..23724028fe6 100755 --- a/src/script/run-make.sh +++ b/src/script/run-make.sh @@ -29,6 +29,7 @@ function clean_up_after_myself() { function detect_ceph_dev_pkgs() { local boost_root=/opt/ceph + local cmake_opts="" if test -f $boost_root/include/boost/config.hpp; then cmake_opts+=" -DWITH_SYSTEM_BOOST=ON -DBOOST_ROOT=$boost_root" else diff --git a/src/test/ObjectMap/KeyValueDBMemory.cc b/src/test/ObjectMap/KeyValueDBMemory.cc index 234e963397e..cfe25930d6a 100644 --- a/src/test/ObjectMap/KeyValueDBMemory.cc +++ b/src/test/ObjectMap/KeyValueDBMemory.cc @@ -132,12 +132,26 @@ public: return ""; } + string_view key_as_sv() override { + if (valid()) + return (*it).first.second; + else + return ""; + } + pair<string,string> raw_key() override { if (valid()) return (*it).first; else return make_pair("", ""); } + + pair<string_view,string_view> raw_key_as_sv() override { + if (valid()) + return (*it).first; + else + return make_pair("", ""); + } bool raw_key_is_prefixed(const string &prefix) override { return prefix == (*it).first.first; @@ -150,6 +164,13 @@ public: return bufferlist(); } + std::string_view value_as_sv() override { + if (valid()) + return std::string_view{it->second.c_str(), it->second.length()}; + else + return std::string_view(); + } + int status() override { return 0; } diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index 8b1f7435c87..7874411e0ff 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -157,7 +157,10 @@ struct btree_test_base : }).safe_then([this] { return seastar::do_with( cache->create_transaction( - Transaction::src_t::MUTATE, "test_set_up_fut", false), + Transaction::src_t::MUTATE, + "test_set_up_fut", + CACHE_HINT_TOUCH, + false), [this](auto &ref_t) { return with_trans_intr(*ref_t, [&](auto &t) { cache->init(); @@ -236,7 +239,10 @@ struct lba_btree_test : btree_test_base { template <typename F> auto lba_btree_update(F &&f) { auto tref = cache->create_transaction( - Transaction::src_t::MUTATE, "test_btree_update", false); + Transaction::src_t::MUTATE, + "test_btree_update", + CACHE_HINT_TOUCH, + false); auto &t = *tref; with_trans_intr( t, @@ -281,7 +287,10 @@ struct lba_btree_test : btree_test_base { template <typename F> auto lba_btree_read(F &&f) { auto t = cache->create_transaction( - Transaction::src_t::READ, "test_btree_read", false); + Transaction::src_t::READ, + "test_btree_read", + CACHE_HINT_TOUCH, + false); return with_trans_intr( *t, [this, f=std::forward<F>(f)](auto &t) mutable { @@ -429,7 +438,10 @@ struct btree_lba_manager_test : btree_test_base { auto create_transaction(bool create_fake_extent=true) { auto t = test_transaction_t{ cache->create_transaction( - Transaction::src_t::MUTATE, "test_mutate_lba", false), + Transaction::src_t::MUTATE, + "test_mutate_lba", + CACHE_HINT_TOUCH, + false), test_lba_mappings }; if (create_fake_extent) { @@ -445,7 +457,10 @@ struct btree_lba_manager_test : btree_test_base { auto create_weak_transaction() { auto t = test_transaction_t{ cache->create_transaction( - Transaction::src_t::READ, "test_read_weak", true), + Transaction::src_t::READ, + "test_read_weak", + CACHE_HINT_TOUCH, + true), test_lba_mappings }; return t; diff --git a/src/test/crimson/seastore/test_seastore_cache.cc b/src/test/crimson/seastore/test_seastore_cache.cc index 6e24f436b98..fa774886139 100644 --- a/src/test/crimson/seastore/test_seastore_cache.cc +++ b/src/test/crimson/seastore/test_seastore_cache.cc @@ -87,7 +87,10 @@ struct cache_test_t : public seastar_test_suite_t { auto get_transaction() { return cache->create_transaction( - Transaction::src_t::MUTATE, "test_cache", false); + Transaction::src_t::MUTATE, + "test_cache", + CACHE_HINT_TOUCH, + false); } template <typename T, typename... Args> diff --git a/src/test/crimson/test_backfill.cc b/src/test/crimson/test_backfill.cc index 7e058c80ed6..e0fc5821d08 100644 --- a/src/test/crimson/test_backfill.cc +++ b/src/test/crimson/test_backfill.cc @@ -119,6 +119,11 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener { events_to_dispatch.emplace_back(event.intrusive_from_this()); } + template <class EventT> + void schedule_event_immediate(const EventT& event) { + events_to_dispatch.emplace_front(event.intrusive_from_this()); + } + // BackfillListener { void request_replica_scan( const pg_shard_t& target, @@ -188,12 +193,11 @@ public: struct PGFacade; void cancel() { - events_to_dispatch.clear(); - schedule_event(crimson::osd::BackfillState::CancelBackfill{}); + schedule_event_immediate(crimson::osd::BackfillState::CancelBackfill{}); } void resume() { - schedule_event(crimson::osd::BackfillState::Triggered{}); + schedule_event_immediate(crimson::osd::BackfillState::Triggered{}); } }; @@ -274,6 +278,9 @@ struct BackfillFixture::PGFacade : public crimson::osd::BackfillState::PGFacade return backfill_source.projected_log; } + std::ostream &print(std::ostream &out) const override { + return out << "FakePGFacade"; + } }; BackfillFixture::BackfillFixture( @@ -452,7 +459,69 @@ TEST(backfill, two_empty_replicas) EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); } -TEST(backfill, cancel_resume) +TEST(backfill, cancel_resume_middle_of_primaryscan) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.cancel(); + cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.resume(); + cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_replicascan1) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.cancel(); + cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); + cluster_fixture.resume(); + cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_replicascan2) { const auto reference_store = FakeStore{ { { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, @@ -469,12 +538,43 @@ TEST(backfill, cancel_resume) EXPECT_CALL(cluster_fixture, backfilled); cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); cluster_fixture.cancel(); cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); cluster_fixture.resume(); cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_push1) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.cancel(); + cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); + cluster_fixture.resume(); + cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); @@ -483,7 +583,7 @@ TEST(backfill, cancel_resume) EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); } -TEST(backfill, cancel_resume_middle_of_scan) +TEST(backfill, cancel_resume_middle_of_push2) { const auto reference_store = FakeStore{ { { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, @@ -501,14 +601,46 @@ TEST(backfill, cancel_resume_middle_of_scan) EXPECT_CALL(cluster_fixture, backfilled); cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); cluster_fixture.cancel(); cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); cluster_fixture.resume(); cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.next_till_done(); + + EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); +} + +TEST(backfill, cancel_resume_middle_of_push3) +{ + const auto reference_store = FakeStore{ { + { "1:00058bcc:::rbd_data.1018ac3e755.00000000000000d5:head", {10, 234} }, + { "1:00ed7f8e:::rbd_data.1018ac3e755.00000000000000af:head", {10, 196} }, + { "1:01483aea:::rbd_data.1018ac3e755.0000000000000095:head", {10, 169} }, + }}; + auto cluster_fixture = BackfillFixtureBuilder::add_source( + reference_store.objs + ).add_target( + { /* nothing 1 */ } + ).add_target( + { /* nothing 2 */ } + ).get_result(); + + EXPECT_CALL(cluster_fixture, backfilled); + cluster_fixture.next_round2<crimson::osd::BackfillState::PrimaryScanned>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ReplicaScanned>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.cancel(); + cluster_fixture.next_round2<crimson::osd::BackfillState::CancelBackfill>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); cluster_fixture.next_round2<crimson::osd::BackfillState::ObjectPushed>(); + cluster_fixture.resume(); + cluster_fixture.next_round2<crimson::osd::BackfillState::Triggered>(); + cluster_fixture.next_round2<crimson::osd::BackfillState::RequestDone>(); cluster_fixture.next_till_done(); EXPECT_TRUE(cluster_fixture.all_stores_look_like(reference_store)); diff --git a/src/test/librados/aio.cc b/src/test/librados/aio.cc index 68587fe87d1..7fb90bdd38e 100644 --- a/src/test/librados/aio.cc +++ b/src/test/librados/aio.cc @@ -1722,3 +1722,59 @@ TEST(LibRadosAioEC, MultiWrite) { rados_aio_release(my_completion2); rados_aio_release(my_completion3); } + +TEST(LibRadosAio, CancelBeforeSubmit) { + AioTestData test_data; + ASSERT_EQ("", test_data.init()); + + rados_completion_t completion; + ASSERT_EQ(0, rados_aio_create_completion2(nullptr, nullptr, &completion)); + + ASSERT_EQ(0, rados_aio_cancel(test_data.m_ioctx, completion)); + rados_aio_release(completion); +} + +TEST(LibRadosAio, CancelBeforeComplete) { + AioTestData test_data; + ASSERT_EQ("", test_data.init()); + + // cancellation tests are racy, so retry if completion beats the cancellation + int ret = 0; + int tries = 10; + do { + rados_completion_t completion; + ASSERT_EQ(0, rados_aio_create_completion2(nullptr, nullptr, &completion)); + char buf[128]; + ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent", + completion, buf, sizeof(buf), 0)); + + ASSERT_EQ(0, rados_aio_cancel(test_data.m_ioctx, completion)); + { + TestAlarm alarm; + ASSERT_EQ(0, rados_aio_wait_for_complete(completion)); + } + ret = rados_aio_get_return_value(completion); + rados_aio_release(completion); + } while (ret == -ENOENT && --tries); + + ASSERT_EQ(-ECANCELED, ret); +} + +TEST(LibRadosAio, CancelAfterComplete) { + AioTestData test_data; + rados_completion_t completion; + ASSERT_EQ("", test_data.init()); + + ASSERT_EQ(0, rados_aio_create_completion2(nullptr, nullptr, &completion)); + char buf[128]; + ASSERT_EQ(0, rados_aio_read(test_data.m_ioctx, "nonexistent", + completion, buf, sizeof(buf), 0)); + + { + TestAlarm alarm; + ASSERT_EQ(0, rados_aio_wait_for_complete(completion)); + } + ASSERT_EQ(0, rados_aio_cancel(test_data.m_ioctx, completion)); + ASSERT_EQ(-ENOENT, rados_aio_get_return_value(completion)); + rados_aio_release(completion); +} diff --git a/src/test/librados/aio_cxx.cc b/src/test/librados/aio_cxx.cc index a70af050d70..5e35869b5c2 100644 --- a/src/test/librados/aio_cxx.cc +++ b/src/test/librados/aio_cxx.cc @@ -2467,3 +2467,92 @@ TEST(LibRadosAio, MultiReads) { ASSERT_EQ(0, memcmp(buf, bl.c_str(), sizeof(buf))); } } + +// cancellation test fixture for global setup/teardown +// parameterized to test both IoCtx::aio_cancel() and AioCompletion::cancel() +class Cancel : public ::testing::TestWithParam<bool> { + static constexpr auto pool_prefix = "ceph_test_rados_api_pp"; + static Rados rados; + static std::string pool_name; + protected: + static IoCtx ioctx; + public: + static void SetUpTestCase() { + pool_name = get_temp_pool_name(pool_prefix); + ASSERT_EQ("", create_one_pool_pp(pool_name, rados)); + ASSERT_EQ(0, rados.ioctx_create(pool_name.c_str(), ioctx)); + } + static void TearDownTestCase() { + destroy_one_pool_pp(pool_name, rados); + } +}; +Rados Cancel::rados; +std::string Cancel::pool_name; +IoCtx Cancel::ioctx; + +TEST_P(Cancel, BeforeSubmit) +{ + const bool use_completion = GetParam(); + + auto c = std::unique_ptr<AioCompletion>{Rados::aio_create_completion()}; + if (use_completion) { + ASSERT_EQ(0, c->cancel()); + } else { + ASSERT_EQ(0, ioctx.aio_cancel(c.get())); + } +} + +TEST_P(Cancel, BeforeComplete) +{ + const bool use_completion = GetParam(); + + // cancellation tests are racy, so retry if completion beats the cancellation + int ret = 0; + int tries = 10; + do { + auto c = std::unique_ptr<AioCompletion>{Rados::aio_create_completion()}; + ObjectReadOperation op; + op.assert_exists(); + ioctx.aio_operate("nonexistent", c.get(), &op, nullptr); + + if (use_completion) { + EXPECT_EQ(0, c->cancel()); + } else { + EXPECT_EQ(0, ioctx.aio_cancel(c.get())); + } + { + TestAlarm alarm; + ASSERT_EQ(0, c->wait_for_complete()); + } + ret = c->get_return_value(); + } while (ret == -ENOENT && --tries); + + EXPECT_EQ(-ECANCELED, ret); +} + +TEST_P(Cancel, AfterComplete) +{ + const bool use_completion = GetParam(); + + auto c = std::unique_ptr<AioCompletion>{Rados::aio_create_completion()}; + ObjectReadOperation op; + op.assert_exists(); + ioctx.aio_operate("nonexistent", c.get(), &op, nullptr); + { + TestAlarm alarm; + ASSERT_EQ(0, c->wait_for_complete()); + } + if (use_completion) { + EXPECT_EQ(0, c->cancel()); + } else { + EXPECT_EQ(0, ioctx.aio_cancel(c.get())); + } + EXPECT_EQ(-ENOENT, c->get_return_value()); +} + +std::string cancel_test_name(const testing::TestParamInfo<Cancel::ParamType>& info) +{ + return info.param ? "cancel" : "aio_cancel"; +} + +INSTANTIATE_TEST_SUITE_P(LibRadosAio, Cancel, testing::Bool(), cancel_test_name); diff --git a/src/test/librados/asio.cc b/src/test/librados/asio.cc index 01ebb957150..500f36508a7 100644 --- a/src/test/librados/asio.cc +++ b/src/test/librados/asio.cc @@ -21,10 +21,14 @@ #include <boost/range/begin.hpp> #include <boost/range/end.hpp> +#include <boost/asio/bind_cancellation_slot.hpp> +#include <boost/asio/cancellation_signal.hpp> #include <boost/asio/io_context.hpp> #include <boost/asio/spawn.hpp> #include <boost/asio/use_future.hpp> +#include <optional> + #define dout_subsys ceph_subsys_rados #define dout_context g_ceph_context @@ -78,6 +82,15 @@ void rethrow(std::exception_ptr eptr) { if (eptr) std::rethrow_exception(eptr); } +auto capture(std::optional<error_code>& out) { + return [&out] (error_code ec, ...) { out = ec; }; +} + +auto capture(boost::asio::cancellation_signal& signal, + std::optional<error_code>& out) { + return boost::asio::bind_cancellation_slot(signal.slot(), capture(out)); +} + TEST_F(AsioRados, AsyncReadCallback) { boost::asio::io_context service; @@ -385,6 +398,130 @@ TEST_F(AsioRados, AsyncWriteOperationYield) service.run(); } +// FIXME: this crashes on windows with: +// Thread 1 received signal SIGILL, Illegal instruction. +#ifndef _WIN32 + +TEST_F(AsioRados, AsyncReadOperationCancelTerminal) +{ + // cancellation tests are racy, so retry if completion beats the cancellation + boost::system::error_code ec; + int tries = 10; + do { + boost::asio::io_context service; + boost::asio::cancellation_signal signal; + std::optional<error_code> result; + + librados::ObjectReadOperation op; + op.assert_exists(); + librados::async_operate(service, io, "noexist", &op, 0, nullptr, + capture(signal, result)); + + service.poll(); + EXPECT_FALSE(service.stopped()); + EXPECT_FALSE(result); + + signal.emit(boost::asio::cancellation_type::terminal); + + service.run(); + ASSERT_TRUE(result); + ec = *result; + + signal.emit(boost::asio::cancellation_type::all); // noop + } while (ec == std::errc::no_such_file_or_directory && --tries); + + EXPECT_EQ(ec, boost::asio::error::operation_aborted); +} + +TEST_F(AsioRados, AsyncReadOperationCancelTotal) +{ + // cancellation tests are racy, so retry if completion beats the cancellation + boost::system::error_code ec; + int tries = 10; + do { + boost::asio::io_context service; + boost::asio::cancellation_signal signal; + std::optional<error_code> result; + + librados::ObjectReadOperation op; + op.assert_exists(); + librados::async_operate(service, io, "noexist", &op, 0, nullptr, + capture(signal, result)); + + service.poll(); + EXPECT_FALSE(service.stopped()); + EXPECT_FALSE(result); + + signal.emit(boost::asio::cancellation_type::total); + + service.run(); + ASSERT_TRUE(result); + ec = *result; + + signal.emit(boost::asio::cancellation_type::all); // noop + } while (ec == std::errc::no_such_file_or_directory && --tries); + + EXPECT_EQ(ec, boost::asio::error::operation_aborted); +} + +TEST_F(AsioRados, AsyncWriteOperationCancelTerminal) +{ + // cancellation tests are racy, so retry if completion beats the cancellation + boost::system::error_code ec; + int tries = 10; + do { + boost::asio::io_context service; + boost::asio::cancellation_signal signal; + std::optional<error_code> result; + + librados::ObjectWriteOperation op; + op.assert_exists(); + librados::async_operate(service, io, "noexist", &op, 0, nullptr, + capture(signal, result)); + + service.poll(); + EXPECT_FALSE(service.stopped()); + EXPECT_FALSE(result); + + signal.emit(boost::asio::cancellation_type::terminal); + + service.run(); + ASSERT_TRUE(result); + ec = *result; + + signal.emit(boost::asio::cancellation_type::all); // noop + } while (ec == std::errc::no_such_file_or_directory && --tries); + + EXPECT_EQ(ec, boost::asio::error::operation_aborted); +} + +TEST_F(AsioRados, AsyncWriteOperationCancelTotal) +{ + boost::asio::io_context service; + boost::asio::cancellation_signal signal; + std::optional<error_code> ec; + + librados::ObjectWriteOperation op; + op.assert_exists(); + librados::async_operate(service, io, "noexist", &op, 0, nullptr, + capture(signal, ec)); + + service.poll(); + EXPECT_FALSE(service.stopped()); + EXPECT_FALSE(ec); + + // noop, write only supports terminal + signal.emit(boost::asio::cancellation_type::total); + + service.run(); + ASSERT_TRUE(ec); + EXPECT_EQ(ec, std::errc::no_such_file_or_directory); + + signal.emit(boost::asio::cancellation_type::all); // noop +} + +#endif // not _WIN32 + int main(int argc, char **argv) { auto args = argv_to_vec(argc, argv); diff --git a/src/test/objectstore/ObjectStoreImitator.h b/src/test/objectstore/ObjectStoreImitator.h index d71d7f2fe58..875f9041b83 100644 --- a/src/test/objectstore/ObjectStoreImitator.h +++ b/src/test/objectstore/ObjectStoreImitator.h @@ -347,6 +347,16 @@ public: ) override { return {}; } + + int omap_iterate(CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + /// [in] where the iterator should point to at the beginning + omap_iter_seek_t start_from, + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override { + return 0; + } + void set_fsid(uuid_d u) override {} uuid_d get_fsid() override { return {}; } uint64_t estimate_objects_overhead(uint64_t num_objects) override { diff --git a/src/test/objectstore/allocsim/ops_replayer.cc b/src/test/objectstore/allocsim/ops_replayer.cc index fd947f5c454..c5908d9f576 100644 --- a/src/test/objectstore/allocsim/ops_replayer.cc +++ b/src/test/objectstore/allocsim/ops_replayer.cc @@ -1,4 +1,5 @@ #include <algorithm> +#include <functional> #include <boost/program_options/value_semantic.hpp> #include <cassert> #include <cctype> @@ -13,26 +14,46 @@ #include <fstream> #include <filesystem> #include <mutex> -#include "include/rados/buffer_fwd.h" -#include "include/rados/librados.hpp" #include <atomic> -#include <fmt/format.h> #include <map> #include <memory> #include <random> #include <string> #include <iostream> #include <vector> +#include <format> + +#include <fmt/format.h> #include <boost/program_options/variables_map.hpp> #include <boost/program_options/parsers.hpp> +#include "include/rados/buffer_fwd.h" +#include "include/rados/librados.hpp" + namespace po = boost::program_options; using namespace std; using namespace ceph; +namespace settings { + +// Returns a function which restricts a value to a specified range by throwing if it is not in range: +// (Note: std::clamp() does not throw.) +auto clamp_or_throw(auto min, auto max) +{ + return [=](auto& x) { + if(std::less<>{}(x, min) or std::greater<>{}(x, max)) { + throw std::out_of_range(fmt::format("value expected between {} and {}, but got {}", min, max, x)); + } + + return x; + }; +} + +} // namespace settings + // compare shared_ptr<string> struct StringPtrCompare { @@ -338,8 +359,8 @@ int main(int argc, char** argv) { // options uint64_t io_depth = 8; - uint64_t nparser_threads = 16; - uint64_t nworker_threads = 16; + int nparser_threads = 16; + int nworker_threads = 16; string file("input.txt"); string ceph_conf_path("./ceph.conf"); string pool("test_pool"); @@ -351,8 +372,8 @@ int main(int argc, char** argv) { ("input-files,i", po::value<vector<string>>()->multitoken(), "List of input files (output of op_scraper.py). Multiple files will be merged and sorted by time order") ("ceph-conf", po::value<string>(&ceph_conf_path)->default_value("ceph.conf"), "Path to ceph conf") ("io-depth", po::value<uint64_t>(&io_depth)->default_value(64), "I/O depth") - ("parser-threads", po::value<uint64_t>(&nparser_threads)->default_value(16), "Number of parser threads") - ("worker-threads", po::value<uint64_t>(&nworker_threads)->default_value(16), "Number of I/O worker threads") + ("parser-threads", po::value<int>(&nparser_threads)->default_value(16)->notifier(settings::clamp_or_throw(1, 256)), "Number of parser threads") + ("worker-threads", po::value<int>(&nworker_threads)->default_value(16)->notifier(settings::clamp_or_throw(1, 256)), "Number of I/O worker threads") ("pool", po::value<string>(&pool)->default_value("test_pool"), "Pool to use for I/O") ("skip-do-ops", po::bool_switch(&skip_do_ops)->default_value(false), "Skip doing operations") ; diff --git a/src/test/osd/CMakeLists.txt b/src/test/osd/CMakeLists.txt index f2d1471e22e..798558ebbe0 100644 --- a/src/test/osd/CMakeLists.txt +++ b/src/test/osd/CMakeLists.txt @@ -22,7 +22,7 @@ install(TARGETS add_executable(ceph_test_rados_io_sequence ${CMAKE_CURRENT_SOURCE_DIR}/ceph_test_rados_io_sequence.cc) target_link_libraries(ceph_test_rados_io_sequence - librados global object_io_exerciser) + librados global object_io_exerciser json_structures) install(TARGETS ceph_test_rados_io_sequence DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/src/test/osd/ceph_test_rados_io_sequence.cc b/src/test/osd/ceph_test_rados_io_sequence.cc index 4a768a016e2..96808ea37e5 100644 --- a/src/test/osd/ceph_test_rados_io_sequence.cc +++ b/src/test/osd/ceph_test_rados_io_sequence.cc @@ -1,83 +1,104 @@ #include "ceph_test_rados_io_sequence.h" +#include <boost/asio/io_context.hpp> #include <iostream> #include <vector> -#include <boost/asio/io_context.hpp> - -#include "include/random.h" - -#include "librados/librados_asio.h" -#include "common/ceph_argparse.h" -#include "include/interval_set.h" -#include "global/global_init.h" -#include "global/global_context.h" +#include "common/Formatter.h" #include "common/Thread.h" +#include "common/ceph_argparse.h" +#include "common/ceph_json.h" #include "common/debug.h" #include "common/dout.h" #include "common/split.h" #include "common/strtol.h" // for strict_iecstrtoll() +#include "common/ceph_json.h" +#include "common/Formatter.h" #include "common/io_exerciser/DataGenerator.h" +#include "common/io_exerciser/EcIoSequence.h" +#include "common/io_exerciser/IoOp.h" +#include "common/io_exerciser/IoSequence.h" #include "common/io_exerciser/Model.h" #include "common/io_exerciser/ObjectModel.h" #include "common/io_exerciser/RadosIo.h" -#include "common/io_exerciser/IoOp.h" -#include "common/io_exerciser/IoSequence.h" +#include "common/json/BalancerStructures.h" +#include "common/json/ConfigStructures.h" +#include "common/json/OSDStructures.h" +#include "fmt/format.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "include/interval_set.h" +#include "include/random.h" +#include "json_spirit/json_spirit.h" +#include "librados/librados_asio.h" #define dout_subsys ceph_subsys_rados #define dout_context g_ceph_context +using OpType = ceph::io_exerciser::OpType; + +using DoneOp = ceph::io_exerciser::DoneOp; +using BarrierOp = ceph::io_exerciser::BarrierOp; +using CreateOp = ceph::io_exerciser::CreateOp; +using RemoveOp = ceph::io_exerciser::RemoveOp; +using SingleReadOp = ceph::io_exerciser::SingleReadOp; +using DoubleReadOp = ceph::io_exerciser::DoubleReadOp; +using TripleReadOp = ceph::io_exerciser::TripleReadOp; +using SingleWriteOp = ceph::io_exerciser::SingleWriteOp; +using DoubleWriteOp = ceph::io_exerciser::DoubleWriteOp; +using TripleWriteOp = ceph::io_exerciser::TripleWriteOp; +using SingleFailedWriteOp = ceph::io_exerciser::SingleFailedWriteOp; +using DoubleFailedWriteOp = ceph::io_exerciser::DoubleFailedWriteOp; +using TripleFailedWriteOp = ceph::io_exerciser::TripleFailedWriteOp; + namespace { - struct Size {}; - void validate(boost::any& v, const std::vector<std::string>& values, - Size *target_type, int) { - po::validators::check_first_occurrence(v); - const std::string &s = po::validators::get_single_string(values); - - std::string parse_error; - uint64_t size = strict_iecstrtoll(s, &parse_error); - if (!parse_error.empty()) { - throw po::validation_error(po::validation_error::invalid_option_value); - } - v = boost::any(size); - } - - struct Pair {}; - void validate(boost::any& v, const std::vector<std::string>& values, - Pair *target_type, int) { - po::validators::check_first_occurrence(v); - const std::string &s = po::validators::get_single_string(values); - auto part = ceph::split(s).begin(); - std::string parse_error; - int first = strict_iecstrtoll(*part++, &parse_error); - int second = strict_iecstrtoll(*part, &parse_error); - if (!parse_error.empty()) { - throw po::validation_error(po::validation_error::invalid_option_value); - } - v = boost::any(std::pair<int,int>{first,second}); - } - - struct PluginString {}; - void validate(boost::any& v, const std::vector<std::string>& values, - PluginString *target_type, int) { - po::validators::check_first_occurrence(v); - const std::string &s = po::validators::get_single_string(values); - - const std::string_view* pluginIt = std::find( - ceph::io_sequence::tester::pluginChoices.begin(), - ceph::io_sequence::tester::pluginChoices.end(), - s - ); - if(ceph::io_sequence::tester::pluginChoices.end() == pluginIt) - { - throw po::validation_error(po::validation_error::invalid_option_value); - } +struct Size {}; +void validate(boost::any& v, const std::vector<std::string>& values, + Size* target_type, int) { + po::validators::check_first_occurrence(v); + const std::string& s = po::validators::get_single_string(values); - v = boost::any(*pluginIt); + std::string parse_error; + uint64_t size = strict_iecstrtoll(s, &parse_error); + if (!parse_error.empty()) { + throw po::validation_error(po::validation_error::invalid_option_value); } + v = boost::any(size); +} + +struct Pair {}; +void validate(boost::any& v, const std::vector<std::string>& values, + Pair* target_type, int) { + po::validators::check_first_occurrence(v); + const std::string& s = po::validators::get_single_string(values); + auto part = ceph::split(s).begin(); + std::string parse_error; + int first = strict_iecstrtoll(*part++, &parse_error); + int second = strict_iecstrtoll(*part, &parse_error); + if (!parse_error.empty()) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + v = boost::any(std::pair<int, int>{first, second}); +} + +struct PluginString {}; +void validate(boost::any& v, const std::vector<std::string>& values, + PluginString* target_type, int) { + po::validators::check_first_occurrence(v); + const std::string& s = po::validators::get_single_string(values); + + const std::string_view* pluginIt = + std::find(ceph::io_sequence::tester::pluginChoices.begin(), + ceph::io_sequence::tester::pluginChoices.end(), s); + if (ceph::io_sequence::tester::pluginChoices.end() == pluginIt) { + throw po::validation_error(po::validation_error::invalid_option_value); + } + + v = boost::any(*pluginIt); +} - constexpr std::string_view usage[] = { +constexpr std::string_view usage[] = { "Basic usage:", "", "ceph_test_rados_io_sequence", @@ -119,103 +140,99 @@ namespace { "\t are specified with unit of blocksize. Supported commands:", "\t\t create <len>", "\t\t remove", - "\t\t read|write <off> <len>", - "\t\t read2|write2 <off> <len> <off> <len>", - "\t\t read3|write3 <off> <len> <off> <len> <off> <len>", - "\t\t done" - }; - - po::options_description get_options_description() - { - po::options_description desc("ceph_test_rados_io options"); - desc.add_options() - ("help,h", - "show help message") - ("listsequence,l", - "show list of sequences") - ("dryrun,d", - "test sequence, do not issue any I/O") - ("verbose", - "more verbose output during test") - ("sequence,s", po::value<int>(), - "test specified sequence") - ("seed", po::value<int>(), - "seed for whole test") - ("seqseed", po::value<int>(), - "seed for sequence") - ("blocksize,b", po::value<Size>(), - "block size (default 2048)") - ("chunksize,c", po::value<Size>(), - "chunk size (default 4096)") - ("pool,p", po::value<std::string>(), - "pool name") - ("object,o", po::value<std::string>()->default_value("test"), - "object name") - ("km", po::value<Pair>(), - "k,m EC pool profile (default 2,2)") - ("plugin", po::value<PluginString>(), - "EC plugin (isa or jerasure)") - ("objectsize", po::value<Pair>(), - "min,max object size in blocks (default 1,32)") - ("threads,t", po::value<int>(), - "number of threads of I/O per object (default 1)") - ("parallel,p", po::value<int>()->default_value(1), - "number of objects to exercise in parallel") - ("interactive", - "interactive mode, execute IO commands from stdin"); - - return desc; - } - - int parse_io_seq_options( - po::variables_map& vm, - int argc, - char** argv) - { - std::vector<std::string> unrecognized_options; - try { - po::options_description desc = get_options_description(); - - auto parsed = po::command_line_parser(argc, argv) - .options(desc) - .allow_unregistered() - .run(); - po::store(parsed, vm); - po::notify(vm); - unrecognized_options = po::collect_unrecognized(parsed.options, - po::include_positional); - - if (!unrecognized_options.empty()) - { - std::stringstream ss; - ss << "Unrecognised command options supplied: "; - while (unrecognized_options.size() > 1) - { - ss << unrecognized_options.back().c_str() << ", "; - unrecognized_options.pop_back(); - } - ss << unrecognized_options.back(); - dout(0) << ss.str() << dendl; - return 1; + "\t\t read|write|failedwrite <off> <len>", + "\t\t read2|write2|failedwrite2 <off> <len> <off> <len>", + "\t\t read3|write3|failedwrite3 <off> <len> <off> <len> <off> <len>", + "\t\t injecterror <type> <shard> <good_count> <fail_count>", + "\t\t clearinject <type> <shard>", + "\t\t done"}; + +po::options_description get_options_description() { + po::options_description desc("ceph_test_rados_io options"); + desc.add_options()("help,h", "show help message")("listsequence,l", + "show list of sequences")( + "dryrun,d", "test sequence, do not issue any I/O")( + "verbose", "more verbose output during test")( + "sequence,s", po::value<int>(), "test specified sequence")( + "seed", po::value<int>(), "seed for whole test")( + "seqseed", po::value<int>(), "seed for sequence")( + "blocksize,b", po::value<Size>(), "block size (default 2048)")( + "chunksize,c", po::value<Size>(), "chunk size (default 4096)")( + "pool,p", po::value<std::string>(), "pool name")( + "object,o", po::value<std::string>()->default_value("test"), + "object name")("km", po::value<Pair>(), + "k,m EC pool profile (default 2,2)")( + "plugin", po::value<PluginString>(), "EC plugin (isa or jerasure)")( + "objectsize", po::value<Pair>(), + "min,max object size in blocks (default 1,32)")( + "threads,t", po::value<int>(), + "number of threads of I/O per object (default 1)")( + "parallel,p", po::value<int>()->default_value(1), + "number of objects to exercise in parallel")( + "testrecovery", + "Inject errors during sequences to test recovery processes of OSDs")( + "interactive", "interactive mode, execute IO commands from stdin")( + "allow_pool_autoscaling", + "Allows pool autoscaling. Disabled by default.")( + "allow_pool_balancer", "Enables pool balancing. Disabled by default.")( + "allow_pool_deep_scrubbing", + "Enables pool deep scrub. Disabled by default.")( + "allow_pool_scrubbing", "Enables pool scrubbing. Disabled by default."); + + return desc; +} + +int parse_io_seq_options(po::variables_map& vm, int argc, char** argv) { + std::vector<std::string> unrecognized_options; + try { + po::options_description desc = get_options_description(); + + auto parsed = po::command_line_parser(argc, argv) + .options(desc) + .allow_unregistered() + .run(); + po::store(parsed, vm); + po::notify(vm); + unrecognized_options = + po::collect_unrecognized(parsed.options, po::include_positional); + + if (!unrecognized_options.empty()) { + std::stringstream ss; + ss << "Unrecognised command options supplied: "; + while (unrecognized_options.size() > 1) { + ss << unrecognized_options.back().c_str() << ", "; + unrecognized_options.pop_back(); } - } catch(const po::error& e) { - std::cerr << "error: " << e.what() << std::endl; + ss << unrecognized_options.back(); + dout(0) << ss.str() << dendl; return 1; } - - return 0; + } catch (const po::error& e) { + std::cerr << "error: " << e.what() << std::endl; + return 1; } + + return 0; } +template <typename S> +int send_mon_command(S& s, librados::Rados& rados, const char* name, + ceph::buffer::list& inbl, ceph::buffer::list* outbl, Formatter* f) { + std::ostringstream oss; + encode_json(name, s, f); + f->flush(oss); + int rc = rados.mon_command(oss.str(), inbl, outbl, nullptr); + return rc; +} + +} // namespace + template <typename T, int N, const std::array<T, N>& Ts> -ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts> - ::ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, - po::variables_map vm, - const std::string& option_name, - bool set_forced, - bool select_first) - : rng(rng), - option_name(option_name) { +ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>:: + ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, + po::variables_map vm, const std::string& option_name, + bool set_forced, bool select_first) + : rng(rng), option_name(option_name) { if (set_forced && vm.count(option_name)) { force_value = vm[option_name].as<T>(); } @@ -226,76 +243,54 @@ ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts> } template <typename T, int N, const std::array<T, N>& Ts> -bool ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::isForced() -{ +bool ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::isForced() { return force_value.has_value(); } template <typename T, int N, const std::array<T, N>& Ts> -const T ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::choose() -{ +const T ceph::io_sequence::tester::ProgramOptionSelector<T, N, Ts>::choose() { if (force_value.has_value()) { return *force_value; } else if (first_value.has_value()) { return *std::exchange(first_value, std::nullopt); } else { - return choices[rng(N-1)]; + return choices[rng(N - 1)]; } } - - ceph::io_sequence::tester::SelectObjectSize::SelectObjectSize( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "objectsize", true, true) -{ -} - - + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "objectsize", true, true) {} ceph::io_sequence::tester::SelectBlockSize::SelectBlockSize( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "blocksize", true, true) -{ -} - - + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "blocksize", true, true) {} ceph::io_sequence::tester::SelectNumThreads::SelectNumThreads( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "threads", true, true) -{ -} - - + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "threads", true, true) {} ceph::io_sequence::tester::SelectSeqRange::SelectSeqRange( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "sequence", false, false) -{ + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "sequence", false, false) { if (vm.count(option_name)) { ceph::io_exerciser::Sequence s = - static_cast<ceph::io_exerciser::Sequence>(vm["sequence"].as<int>()); + static_cast<ceph::io_exerciser::Sequence>(vm["sequence"].as<int>()); if (s < ceph::io_exerciser::Sequence::SEQUENCE_BEGIN || s >= ceph::io_exerciser::Sequence::SEQUENCE_END) { dout(0) << "Sequence argument out of range" << dendl; throw po::validation_error(po::validation_error::invalid_option_value); } ceph::io_exerciser::Sequence e = s; - force_value = std::make_optional<std::pair<ceph::io_exerciser::Sequence, - ceph::io_exerciser::Sequence>>( - std::make_pair(s, ++e)); + force_value = std::make_optional< + std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>>( + std::make_pair(s, ++e)); } } -const std::pair<ceph::io_exerciser::Sequence,ceph::io_exerciser::Sequence> - ceph::io_sequence::tester::SelectSeqRange::choose() { - if (force_value.has_value()) - { +const std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence> +ceph::io_sequence::tester::SelectSeqRange::choose() { + if (force_value.has_value()) { return *force_value; } else { return std::make_pair(ceph::io_exerciser::Sequence::SEQUENCE_BEGIN, @@ -303,45 +298,34 @@ const std::pair<ceph::io_exerciser::Sequence,ceph::io_exerciser::Sequence> } } - - ceph::io_sequence::tester::SelectErasureKM::SelectErasureKM( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "km", true, true) -{ -} - - + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "km", true, true) {} ceph::io_sequence::tester::SelectErasurePlugin::SelectErasurePlugin( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm) - : ProgramOptionSelector(rng, vm, "plugin", true, false) -{ -} - - - -ceph::io_sequence::tester::SelectErasureChunkSize::SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, po::variables_map vm) - : ProgramOptionSelector(rng, vm, "stripe_unit", true, false) -{ -} - + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "plugin", true, false) {} +ceph::io_sequence::tester::SelectErasureChunkSize::SelectErasureChunkSize( + ceph::util::random_number_generator<int>& rng, po::variables_map vm) + : ProgramOptionSelector(rng, vm, "chunksize", true, true) {} ceph::io_sequence::tester::SelectECPool::SelectECPool( - ceph::util::random_number_generator<int>& rng, - po::variables_map vm, - librados::Rados& rados, - bool dry_run) - : ProgramOptionSelector(rng, vm, "pool", false, false), - rados(rados), - dry_run(dry_run), - skm(SelectErasureKM(rng, vm)), - spl(SelectErasurePlugin(rng, vm)), - scs(SelectErasureChunkSize(rng, vm)) -{ + ceph::util::random_number_generator<int>& rng, po::variables_map vm, + librados::Rados& rados, bool dry_run, bool allow_pool_autoscaling, + bool allow_pool_balancer, bool allow_pool_deep_scrubbing, + bool allow_pool_scrubbing, bool test_recovery) + : ProgramOptionSelector(rng, vm, "pool", false, false), + rados(rados), + dry_run(dry_run), + allow_pool_autoscaling(allow_pool_autoscaling), + allow_pool_balancer(allow_pool_balancer), + allow_pool_deep_scrubbing(allow_pool_deep_scrubbing), + allow_pool_scrubbing(allow_pool_scrubbing), + test_recovery(test_recovery), + skm(SelectErasureKM(rng, vm)), + spl(SelectErasurePlugin(rng, vm)), + scs(SelectErasureChunkSize(rng, vm)) { if (!skm.isForced()) { if (vm.count("pool")) { force_value = vm["pool"].as<std::string>(); @@ -349,147 +333,239 @@ ceph::io_sequence::tester::SelectECPool::SelectECPool( } } -const std::string ceph::io_sequence::tester::SelectECPool::choose() -{ - std::pair<int,int> value; +const std::string ceph::io_sequence::tester::SelectECPool::choose() { + std::pair<int, int> value; if (!skm.isForced() && force_value.has_value()) { + int rc; + bufferlist inbl, outbl; + auto formatter = std::make_unique<JSONFormatter>(false); + + ceph::messaging::osd::OSDPoolGetRequest osdPoolGetRequest{*force_value}; + rc = send_mon_command(osdPoolGetRequest, rados, "OSDPoolGetRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + ceph::messaging::osd::OSDPoolGetReply osdPoolGetReply; + osdPoolGetReply.decode_json(&p); + + ceph::messaging::osd::OSDECProfileGetRequest osdECProfileGetRequest{ + osdPoolGetReply.erasure_code_profile}; + rc = send_mon_command(osdECProfileGetRequest, rados, + "OSDECProfileGetRequest", inbl, &outbl, + formatter.get()); + ceph_assert(rc == 0); + + success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + ceph::messaging::osd::OSDECProfileGetReply reply; + reply.decode_json(&p); + k = reply.k; + m = reply.m; return *force_value; } else { value = skm.choose(); } - int k = value.first; - int m = value.second; + k = value.first; + m = value.second; const std::string plugin = std::string(spl.choose()); const uint64_t chunk_size = scs.choose(); - std::string pool_name = "ec_" + plugin + - "_cs" + std::to_string(chunk_size) + - "_k" + std::to_string(k) + - "_m" + std::to_string(m); - if (!dry_run) - { + std::string pool_name = "ec_" + plugin + "_cs" + std::to_string(chunk_size) + + "_k" + std::to_string(k) + "_m" + std::to_string(m); + if (!dry_run) { create_pool(rados, pool_name, plugin, chunk_size, k, m); } return pool_name; } void ceph::io_sequence::tester::SelectECPool::create_pool( - librados::Rados& rados, - const std::string& pool_name, - const std::string& plugin, - uint64_t chunk_size, - int k, int m) -{ + librados::Rados& rados, const std::string& pool_name, + const std::string& plugin, uint64_t chunk_size, int k, int m) { int rc; bufferlist inbl, outbl; - std::string profile_create = - "{\"prefix\": \"osd erasure-code-profile set\", \ - \"name\": \"testprofile-" + pool_name + "\", \ - \"profile\": [ \"plugin=" + plugin + "\", \ - \"k=" + std::to_string(k) + "\", \ - \"m=" + std::to_string(m) + "\", \ - \"stripe_unit=" + std::to_string(chunk_size) + "\", \ - \"crush-failure-domain=osd\"]}"; - rc = rados.mon_command(profile_create, inbl, &outbl, nullptr); + auto formatter = std::make_unique<JSONFormatter>(false); + + ceph::messaging::osd::OSDECProfileSetRequest ecProfileSetRequest{ + fmt::format("testprofile-{}", pool_name), + {fmt::format("plugin={}", plugin), fmt::format("k={}", k), + fmt::format("m={}", m), fmt::format("stripe_unit={}", chunk_size), + fmt::format("crush-failure-domain=osd")}}; + rc = send_mon_command(ecProfileSetRequest, rados, "OSDECProfileSetRequest", + inbl, &outbl, formatter.get()); ceph_assert(rc == 0); - std::string cmdstr = - "{\"prefix\": \"osd pool create\", \ - \"pool\": \"" + pool_name + "\", \ - \"pool_type\": \"erasure\", \ - \"pg_num\": 8, \ - \"pgp_num\": 8, \ - \"erasure_code_profile\": \"testprofile-" + pool_name + "\"}"; - rc = rados.mon_command(cmdstr, inbl, &outbl, nullptr); + + ceph::messaging::osd::OSDECPoolCreateRequest poolCreateRequest{ + pool_name, "erasure", 8, 8, fmt::format("testprofile-{}", pool_name)}; + rc = send_mon_command(poolCreateRequest, rados, "OSDECPoolCreateRequest", + inbl, &outbl, formatter.get()); ceph_assert(rc == 0); -} + if (allow_pool_autoscaling) { + ceph::messaging::osd::OSDSetRequest setNoAutoscaleRequest{"noautoscale", + std::nullopt}; + rc = send_mon_command(setNoAutoscaleRequest, rados, "OSDSetRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + } + + if (allow_pool_balancer) { + ceph::messaging::balancer::BalancerOffRequest balancerOffRequest{}; + rc = send_mon_command(balancerOffRequest, rados, "BalancerOffRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + + ceph::messaging::balancer::BalancerStatusRequest balancerStatusRequest{}; + rc = send_mon_command(balancerStatusRequest, rados, "BalancerStatusRequest", + inbl, &outbl, formatter.get()); + ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + ceph::messaging::balancer::BalancerStatusReply reply; + reply.decode_json(&p); + ceph_assert(!reply.active); + } + if (allow_pool_deep_scrubbing) { + ceph::messaging::osd::OSDSetRequest setNoDeepScrubRequest{"nodeep-scrub", + std::nullopt}; + rc = send_mon_command(setNoDeepScrubRequest, rados, "setNoDeepScrubRequest", + inbl, &outbl, formatter.get()); + ceph_assert(rc == 0); + } + + if (allow_pool_scrubbing) { + ceph::messaging::osd::OSDSetRequest setNoScrubRequest{"noscrub", + std::nullopt}; + rc = send_mon_command(setNoScrubRequest, rados, "OSDSetRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + } + + if (test_recovery) { + ceph::messaging::config::ConfigSetRequest configSetBluestoreDebugRequest{ + "global", "bluestore_debug_inject_read_err", "true", std::nullopt}; + rc = send_mon_command(configSetBluestoreDebugRequest, rados, + "ConfigSetRequest", inbl, &outbl, + formatter.get()); + ceph_assert(rc == 0); + + ceph::messaging::config::ConfigSetRequest configSetMaxMarkdownRequest{ + "global", "osd_max_markdown_count", "99999999", std::nullopt}; + rc = + send_mon_command(configSetMaxMarkdownRequest, rados, "ConfigSetRequest", + inbl, &outbl, formatter.get()); + ceph_assert(rc == 0); + } +} -ceph::io_sequence::tester::TestObject::TestObject( const std::string oid, - librados::Rados& rados, - boost::asio::io_context& asio, - SelectBlockSize& sbs, - SelectECPool& spo, - SelectObjectSize& sos, - SelectNumThreads& snt, - SelectSeqRange& ssr, - ceph::util::random_number_generator<int>& rng, - ceph::mutex& lock, - ceph::condition_variable& cond, - bool dryrun, - bool verbose, - std::optional<int> seqseed) : - rng(rng), verbose(verbose), seqseed(seqseed) -{ +ceph::io_sequence::tester::TestObject::TestObject( + const std::string oid, librados::Rados& rados, + boost::asio::io_context& asio, SelectBlockSize& sbs, SelectECPool& spo, + SelectObjectSize& sos, SelectNumThreads& snt, SelectSeqRange& ssr, + ceph::util::random_number_generator<int>& rng, ceph::mutex& lock, + ceph::condition_variable& cond, bool dryrun, bool verbose, + std::optional<int> seqseed, bool testrecovery) + : rng(rng), verbose(verbose), seqseed(seqseed), testrecovery(testrecovery) { if (dryrun) { - verbose = true; - exerciser_model = std::make_unique<ceph::io_exerciser::ObjectModel>(oid, - sbs.choose(), - rng()); + exerciser_model = std::make_unique<ceph::io_exerciser::ObjectModel>( + oid, sbs.choose(), rng()); } else { const std::string pool = spo.choose(); + poolK = spo.getChosenK(); + poolM = spo.getChosenM(); + int threads = snt.choose(); - exerciser_model = std::make_unique<ceph::io_exerciser::RadosIo>(rados, - asio, - pool, - oid, - sbs.choose(), - rng(), - threads, - lock, - cond); - dout(0) << "= " << oid << " pool=" << pool - << " threads=" << threads - << " blocksize=" << exerciser_model->get_block_size() - << " =" << dendl; + + bufferlist inbl, outbl; + auto formatter = std::make_unique<JSONFormatter>(false); + + std::optional<std::vector<int>> cached_shard_order = std::nullopt; + + if (!spo.get_allow_pool_autoscaling() && !spo.get_allow_pool_balancer() && + !spo.get_allow_pool_deep_scrubbing() && + !spo.get_allow_pool_scrubbing()) { + ceph::messaging::osd::OSDMapRequest osdMapRequest{pool, oid, ""}; + int rc = send_mon_command(osdMapRequest, rados, "OSDMapRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + ceph::messaging::osd::OSDMapReply reply{}; + reply.decode_json(&p); + cached_shard_order = reply.acting; + } + + exerciser_model = std::make_unique<ceph::io_exerciser::RadosIo>( + rados, asio, pool, oid, cached_shard_order, sbs.choose(), rng(), + threads, lock, cond); + dout(0) << "= " << oid << " pool=" << pool << " threads=" << threads + << " blocksize=" << exerciser_model->get_block_size() << " =" + << dendl; } obj_size_range = sos.choose(); seq_range = ssr.choose(); curseq = seq_range.first; - seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq, - obj_size_range, - seqseed.value_or(rng())); + + if (testrecovery) { + seq = ceph::io_exerciser::EcIoSequence::generate_sequence( + curseq, obj_size_range, poolK, poolM, seqseed.value_or(rng())); + } else { + seq = ceph::io_exerciser::IoSequence::generate_sequence( + curseq, obj_size_range, seqseed.value_or(rng())); + } + op = seq->next(); done = false; - dout(0) << "== " << exerciser_model->get_oid() << " " - << curseq << " " - << seq->get_name() - << " ==" <<dendl; + dout(0) << "== " << exerciser_model->get_oid() << " " << curseq << " " + << seq->get_name_with_seqseed() << " ==" << dendl; } -bool ceph::io_sequence::tester::TestObject::readyForIo() -{ +bool ceph::io_sequence::tester::TestObject::readyForIo() { return exerciser_model->readyForIoOp(*op); } -bool ceph::io_sequence::tester::TestObject::next() -{ +bool ceph::io_sequence::tester::TestObject::next() { if (!done) { if (verbose) { - dout(0) << exerciser_model->get_oid() - << " Step " << seq->get_step() << ": " - << op->to_string(exerciser_model->get_block_size()) << dendl; + dout(0) << exerciser_model->get_oid() << " Step " << seq->get_step() + << ": " << op->to_string(exerciser_model->get_block_size()) + << dendl; } else { - dout(5) << exerciser_model->get_oid() - << " Step " << seq->get_step() << ": " - << op->to_string(exerciser_model->get_block_size()) << dendl; + dout(5) << exerciser_model->get_oid() << " Step " << seq->get_step() + << ": " << op->to_string(exerciser_model->get_block_size()) + << dendl; } exerciser_model->applyIoOp(*op); - if (op->done()) { - ++curseq; - if (curseq == seq_range.second) { + if (op->getOpType() == ceph::io_exerciser::OpType::Done) { + curseq = seq->getNextSupportedSequenceId(); + if (curseq >= seq_range.second) { done = true; dout(0) << exerciser_model->get_oid() << " Number of IOs = " << exerciser_model->get_num_io() << dendl; } else { - seq = ceph::io_exerciser::IoSequence::generate_sequence(curseq, - obj_size_range, - seqseed.value_or(rng())); - dout(0) << "== " << exerciser_model->get_oid() << " " - << curseq << " " << seq->get_name() - << " ==" <<dendl; + if (testrecovery) { + seq = ceph::io_exerciser::EcIoSequence::generate_sequence( + curseq, obj_size_range, poolK, poolM, seqseed.value_or(rng())); + } else { + seq = ceph::io_exerciser::IoSequence::generate_sequence( + curseq, obj_size_range, seqseed.value_or(rng())); + } + + dout(0) << "== " << exerciser_model->get_oid() << " " << curseq << " " + << seq->get_name_with_seqseed() << " ==" << dendl; op = seq->next(); } } else { @@ -499,27 +575,30 @@ bool ceph::io_sequence::tester::TestObject::next() return done; } -bool ceph::io_sequence::tester::TestObject::finished() -{ - return done; -} +bool ceph::io_sequence::tester::TestObject::finished() { return done; } -int ceph::io_sequence::tester::TestObject::get_num_io() -{ +int ceph::io_sequence::tester::TestObject::get_num_io() { return exerciser_model->get_num_io(); } ceph::io_sequence::tester::TestRunner::TestRunner(po::variables_map& vm, - librados::Rados& rados) : - rados(rados), - seed(vm.contains("seed") ? vm["seed"].as<int>() : time(nullptr)), - rng(ceph::util::random_number_generator<int>(seed)), - sbs{rng, vm}, - sos{rng, vm}, - spo{rng, vm, rados, vm.contains("dryrun")}, - snt{rng, vm}, - ssr{rng, vm} -{ + librados::Rados& rados) + : rados(rados), + seed(vm.contains("seed") ? vm["seed"].as<int>() : time(nullptr)), + rng(ceph::util::random_number_generator<int>(seed)), + sbs{rng, vm}, + sos{rng, vm}, + spo{rng, + vm, + rados, + vm.contains("dryrun"), + vm.contains("allow_pool_autoscaling"), + vm.contains("allow_pool_balancer"), + vm.contains("allow_pool_deep_scrubbing"), + vm.contains("allow_pool_scrubbing"), + vm.contains("test_recovery")}, + snt{rng, vm}, + ssr{rng, vm} { dout(0) << "Test using seed " << seed << dendl; verbose = vm.contains("verbose"); @@ -532,19 +611,23 @@ ceph::io_sequence::tester::TestRunner::TestRunner(po::variables_map& vm, num_objects = vm["parallel"].as<int>(); object_name = vm["object"].as<std::string>(); interactive = vm.contains("interactive"); + testrecovery = vm.contains("testrecovery"); + + allow_pool_autoscaling = vm.contains("allow_pool_autoscaling"); + allow_pool_balancer = vm.contains("allow_pool_balancer"); + allow_pool_deep_scrubbing = vm.contains("allow_pool_deep_scrubbing"); + allow_pool_scrubbing = vm.contains("allow_pool_scrubbing"); - if (!dryrun) - { + if (!dryrun) { guard.emplace(boost::asio::make_work_guard(asio)); - thread = make_named_thread("io_thread",[&asio = asio] { asio.run(); }); + thread = make_named_thread("io_thread", [&asio = asio] { asio.run(); }); } show_help = vm.contains("help"); show_sequence = vm.contains("listsequence"); } -ceph::io_sequence::tester::TestRunner::~TestRunner() -{ +ceph::io_sequence::tester::TestRunner::~TestRunner() { if (!dryrun) { guard = std::nullopt; asio.stop(); @@ -553,34 +636,38 @@ ceph::io_sequence::tester::TestRunner::~TestRunner() } } -void ceph::io_sequence::tester::TestRunner::help() -{ +void ceph::io_sequence::tester::TestRunner::help() { std::cout << get_options_description() << std::endl; for (auto line : usage) { std::cout << line << std::endl; } } -void ceph::io_sequence::tester::TestRunner::list_sequence() -{ +void ceph::io_sequence::tester::TestRunner::list_sequence(bool testrecovery) { // List seqeunces - std::pair<int,int> obj_size_range = sos.choose(); - for (ceph::io_exerciser::Sequence s - = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN; - s < ceph::io_exerciser::Sequence::SEQUENCE_END; ++s) { - std::unique_ptr<ceph::io_exerciser::IoSequence> seq = - ceph::io_exerciser::IoSequence::generate_sequence(s, - obj_size_range, - seqseed.value_or(rng())); - dout(0) << s << " " << seq->get_name() << dendl; + std::pair<int, int> obj_size_range = sos.choose(); + ceph::io_exerciser::Sequence s = ceph::io_exerciser::Sequence::SEQUENCE_BEGIN; + std::unique_ptr<ceph::io_exerciser::IoSequence> seq; + if (testrecovery) { + seq = ceph::io_exerciser::EcIoSequence::generate_sequence( + s, obj_size_range, spo.getChosenK(), spo.getChosenM(), + seqseed.value_or(rng())); + } else { + seq = ceph::io_exerciser::IoSequence::generate_sequence( + s, obj_size_range, seqseed.value_or(rng())); } + + do { + dout(0) << s << " " << seq->get_name_with_seqseed() << dendl; + s = seq->getNextSupportedSequenceId(); + } while (s != ceph::io_exerciser::Sequence::SEQUENCE_END); } -std::string ceph::io_sequence::tester::TestRunner::get_token() -{ - static std::string line; - static ceph::split split = ceph::split(""); - static ceph::spliterator tokens; +void ceph::io_sequence::tester::TestRunner::clear_tokens() { + tokens = split.end(); +} + +std::string ceph::io_sequence::tester::TestRunner::get_token() { while (line.empty() || tokens == split.end()) { if (!std::getline(std::cin, line)) { throw std::runtime_error("End of input"); @@ -591,127 +678,211 @@ std::string ceph::io_sequence::tester::TestRunner::get_token() return std::string(*tokens++); } -uint64_t ceph::io_sequence::tester::TestRunner::get_numeric_token() -{ +std::optional<std::string> +ceph::io_sequence::tester::TestRunner ::get_optional_token() { + std::optional<std::string> ret = std::nullopt; + if (tokens != split.end()) { + ret = std::string(*tokens++); + } + return ret; +} + +uint64_t ceph::io_sequence::tester::TestRunner::get_numeric_token() { std::string parse_error; std::string token = get_token(); uint64_t num = strict_iecstrtoll(token, &parse_error); if (!parse_error.empty()) { - throw std::runtime_error("Invalid number "+token); + throw std::runtime_error("Invalid number " + token); } return num; } -bool ceph::io_sequence::tester::TestRunner::run_test() -{ - if (show_help) - { +std::optional<uint64_t> +ceph::io_sequence::tester::TestRunner ::get_optional_numeric_token() { + std::string parse_error; + std::optional<std::string> token = get_optional_token(); + if (token) { + uint64_t num = strict_iecstrtoll(*token, &parse_error); + if (!parse_error.empty()) { + throw std::runtime_error("Invalid number " + *token); + } + return num; + } + + return std::optional<uint64_t>(std::nullopt); +} + +bool ceph::io_sequence::tester::TestRunner::run_test() { + if (show_help) { help(); return true; - } - else if (show_sequence) - { - list_sequence(); + } else if (show_sequence) { + list_sequence(testrecovery); return true; - } - else if (interactive) - { + } else if (interactive) { return run_interactive_test(); - } - else - { + } else { return run_automated_test(); } } -bool ceph::io_sequence::tester::TestRunner::run_interactive_test() -{ +bool ceph::io_sequence::tester::TestRunner::run_interactive_test() { bool done = false; std::unique_ptr<ceph::io_exerciser::IoOp> ioop; std::unique_ptr<ceph::io_exerciser::Model> model; if (dryrun) { - model = std::make_unique<ceph::io_exerciser::ObjectModel>(object_name, - sbs.choose(), - rng()); + model = std::make_unique<ceph::io_exerciser::ObjectModel>( + object_name, sbs.choose(), rng()); } else { const std::string pool = spo.choose(); - model = std::make_unique<ceph::io_exerciser::RadosIo>(rados, asio, pool, - object_name, sbs.choose(), - rng(), 1, // 1 thread - lock, cond); + + bufferlist inbl, outbl; + auto formatter = std::make_unique<JSONFormatter>(false); + + ceph::messaging::osd::OSDMapRequest osdMapRequest{pool, object_name, ""}; + int rc = send_mon_command(osdMapRequest, rados, "OSDMapRequest", inbl, + &outbl, formatter.get()); + ceph_assert(rc == 0); + + JSONParser p; + bool success = p.parse(outbl.c_str(), outbl.length()); + ceph_assert(success); + + ceph::messaging::osd::OSDMapReply reply{}; + reply.decode_json(&p); + + model = std::make_unique<ceph::io_exerciser::RadosIo>( + rados, asio, pool, object_name, reply.acting, sbs.choose(), rng(), + 1, // 1 thread + lock, cond); } while (!done) { const std::string op = get_token(); - if (!op.compare("done") || !op.compare("q") || !op.compare("quit")) { - ioop = ceph::io_exerciser::IoOp::generate_done(); - } else if (!op.compare("create")) { - ioop = ceph::io_exerciser::IoOp::generate_create(get_numeric_token()); - } else if (!op.compare("remove") || !op.compare("delete")) { - ioop = ceph::io_exerciser::IoOp::generate_remove(); - } else if (!op.compare("read")) { + if (op == "done" || op == "q" || op == "quit") { + ioop = ceph::io_exerciser::DoneOp::generate(); + } else if (op == "create") { + ioop = ceph::io_exerciser::CreateOp::generate(get_numeric_token()); + } else if (op == "remove" || op == "delete") { + ioop = ceph::io_exerciser::RemoveOp::generate(); + } else if (op == "read") { uint64_t offset = get_numeric_token(); uint64_t length = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_read(offset, length); - } else if (!op.compare("read2")) { + ioop = ceph::io_exerciser::SingleReadOp::generate(offset, length); + } else if (op == "read2") { uint64_t offset1 = get_numeric_token(); uint64_t length1 = get_numeric_token(); uint64_t offset2 = get_numeric_token(); uint64_t length2 = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_read2(offset1, length1, - offset2, length2); - } else if (!op.compare("read3")) { + ioop = DoubleReadOp::generate(offset1, length1, offset2, length2); + } else if (op == "read3") { uint64_t offset1 = get_numeric_token(); uint64_t length1 = get_numeric_token(); uint64_t offset2 = get_numeric_token(); uint64_t length2 = get_numeric_token(); uint64_t offset3 = get_numeric_token(); uint64_t length3 = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_read3(offset1, length1, - offset2, length2, - offset3, length3); - } else if (!op.compare("write")) { + ioop = TripleReadOp::generate(offset1, length1, offset2, length2, offset3, + length3); + } else if (op == "write") { uint64_t offset = get_numeric_token(); uint64_t length = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_write(offset, length); - } else if (!op.compare("write2")) { + ioop = SingleWriteOp::generate(offset, length); + } else if (op == "write2") { uint64_t offset1 = get_numeric_token(); uint64_t length1 = get_numeric_token(); uint64_t offset2 = get_numeric_token(); uint64_t length2 = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_write2(offset1, length1, - offset2, length2); - } else if (!op.compare("write3")) { + ioop = DoubleWriteOp::generate(offset1, length1, offset2, length2); + } else if (op == "write3") { uint64_t offset1 = get_numeric_token(); uint64_t length1 = get_numeric_token(); uint64_t offset2 = get_numeric_token(); uint64_t length2 = get_numeric_token(); uint64_t offset3 = get_numeric_token(); uint64_t length3 = get_numeric_token(); - ioop = ceph::io_exerciser::IoOp::generate_write3(offset1, length1, - offset2, length2, - offset3, length3); + ioop = TripleWriteOp::generate(offset1, length1, offset2, length2, + offset3, length3); + } else if (op == "failedwrite") { + uint64_t offset = get_numeric_token(); + uint64_t length = get_numeric_token(); + ioop = SingleFailedWriteOp::generate(offset, length); + } else if (op == "failedwrite2") { + uint64_t offset1 = get_numeric_token(); + uint64_t length1 = get_numeric_token(); + uint64_t offset2 = get_numeric_token(); + uint64_t length2 = get_numeric_token(); + ioop = DoubleFailedWriteOp::generate(offset1, length1, offset2, length2); + } else if (op == "failedwrite3") { + uint64_t offset1 = get_numeric_token(); + uint64_t length1 = get_numeric_token(); + uint64_t offset2 = get_numeric_token(); + uint64_t length2 = get_numeric_token(); + uint64_t offset3 = get_numeric_token(); + uint64_t length3 = get_numeric_token(); + ioop = TripleFailedWriteOp::generate(offset1, length1, offset2, length2, + offset3, length3); + } else if (op == "injecterror") { + std::string inject_type = get_token(); + int shard = get_numeric_token(); + std::optional<int> type = get_optional_numeric_token(); + std::optional<int> when = get_optional_numeric_token(); + std::optional<int> duration = get_optional_numeric_token(); + if (inject_type == "read") { + ioop = ceph::io_exerciser::InjectReadErrorOp::generate(shard, type, + when, duration); + } else if (inject_type == "write") { + ioop = ceph::io_exerciser::InjectWriteErrorOp::generate(shard, type, + when, duration); + } else { + clear_tokens(); + ioop.reset(); + dout(0) << fmt::format("Invalid error inject {}. No action performed.", + inject_type) + << dendl; + } + } else if (op == "clearinject") { + std::string inject_type = get_token(); + int shard = get_numeric_token(); + std::optional<int> type = get_optional_numeric_token(); + if (inject_type == "read") { + ioop = + ceph::io_exerciser::ClearReadErrorInjectOp::generate(shard, type); + } else if (inject_type == "write") { + ioop = + ceph::io_exerciser::ClearWriteErrorInjectOp::generate(shard, type); + } else { + clear_tokens(); + ioop.reset(); + dout(0) << fmt::format("Invalid error inject {}. No action performed.", + inject_type) + << dendl; + } } else { - throw std::runtime_error("Invalid operation "+op); + clear_tokens(); + ioop.reset(); + dout(0) << fmt::format("Invalid op {}. No action performed.", op) + << dendl; } - dout(0) << ioop->to_string(model->get_block_size()) << dendl; - model->applyIoOp(*ioop); - done = ioop->done(); - if (!done) { - ioop = ceph::io_exerciser::IoOp::generate_barrier(); + if (ioop) { + dout(0) << ioop->to_string(model->get_block_size()) << dendl; model->applyIoOp(*ioop); + done = ioop->getOpType() == ceph::io_exerciser::OpType::Done; + if (!done) { + ioop = ceph::io_exerciser::BarrierOp::generate(); + model->applyIoOp(*ioop); + } } } return true; } -bool ceph::io_sequence::tester::TestRunner::run_automated_test() -{ +bool ceph::io_sequence::tester::TestRunner::run_automated_test() { // Create a test for each object - std::vector<std::shared_ptr< - ceph::io_sequence::tester::TestObject>> test_objects; + std::vector<std::shared_ptr<ceph::io_sequence::tester::TestObject>> + test_objects; for (int obj = 0; obj < num_objects; obj++) { std::string name; @@ -721,15 +892,9 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test() name = object_name + std::to_string(obj); } test_objects.push_back( - std::make_shared<ceph::io_sequence::tester::TestObject>( - name, - rados, asio, - sbs, spo, sos, snt, ssr, - rng, lock, cond, - dryrun, verbose, - seqseed - ) - ); + std::make_shared<ceph::io_sequence::tester::TestObject>( + name, rados, asio, sbs, spo, sos, snt, ssr, rng, lock, cond, dryrun, + verbose, seqseed, testrecovery)); } if (!dryrun) { rados.wait_for_latest_osdmap(); @@ -748,16 +913,15 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test() for (auto obj = test_objects.begin(); obj != test_objects.end(); ++obj) { std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj; if (!to->finished()) { - lock.lock(); - bool ready = to->readyForIo(); - lock.unlock(); - if (ready) - { - to->next(); - started_io = true; - } else { - need_wait = true; - } + lock.lock(); + bool ready = to->readyForIo(); + lock.unlock(); + if (ready) { + to->next(); + started_io = true; + } else { + need_wait = true; + } } } if (!started_io && need_wait) { @@ -767,8 +931,7 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test() std::shared_ptr<ceph::io_sequence::tester::TestObject> to = *obj; if (!to->finished()) { need_wait = !to->readyForIo(); - if (!need_wait) - { + if (!need_wait) { break; } } @@ -788,18 +951,16 @@ bool ceph::io_sequence::tester::TestRunner::run_automated_test() return true; } -int main(int argc, char **argv) -{ +int main(int argc, char** argv) { auto args = argv_to_vec(argc, argv); env_to_vec(args); auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, - CODE_ENVIRONMENT_UTILITY, 0); + CODE_ENVIRONMENT_UTILITY, 0); common_init_finish(cct.get()); po::variables_map vm; int rc = parse_io_seq_options(vm, argc, argv); - if (rc != 0) - { + if (rc != 0) { return rc; } @@ -814,7 +975,7 @@ int main(int argc, char **argv) std::unique_ptr<ceph::io_sequence::tester::TestRunner> runner; try { runner = std::make_unique<ceph::io_sequence::tester::TestRunner>(vm, rados); - } catch(const po::error& e) { + } catch (const po::error& e) { return 1; } runner->run_test(); diff --git a/src/test/osd/ceph_test_rados_io_sequence.h b/src/test/osd/ceph_test_rados_io_sequence.h index 4e21d025700..9af5f706b2f 100644 --- a/src/test/osd/ceph_test_rados_io_sequence.h +++ b/src/test/osd/ceph_test_rados_io_sequence.h @@ -1,34 +1,36 @@ +#include <boost/program_options.hpp> +#include <optional> #include <utility> -#include "include/random.h" - -#include "global/global_init.h" -#include "global/global_context.h" - #include "common/io_exerciser/IoOp.h" #include "common/io_exerciser/IoSequence.h" #include "common/io_exerciser/Model.h" - +#include "common/split.h" +#include "global/global_context.h" +#include "global/global_init.h" +#include "include/random.h" #include "librados/librados_asio.h" #include <boost/asio/io_context.hpp> #include <boost/program_options.hpp> +#include <optional> + /* Overview * * class ProgramOptionSelector - * Base class for selector objects below with common code for + * Base class for selector objects below with common code for * selecting options - * + * * class SelectObjectSize * Selects min and max object sizes for a test * * class SelectErasureKM * Selects an EC k and m value for a test - * + * * class SelectErasurePlugin * Selects an plugin for a test - * + * * class SelectECPool * Selects an EC pool (plugin,k and m) for a test. Also creates the * pool as well. @@ -58,287 +60,279 @@ namespace po = boost::program_options; -namespace ceph -{ - namespace io_sequence::tester - { - // Choices for min and max object size - inline constexpr size_t objectSizeSize = 10; - inline constexpr std::array<std::pair<int,int>,objectSizeSize> - objectSizeChoices = {{ - {1,32}, // Default - best for boundary checking - {12,14}, - {28,30}, - {36,38}, - {42,44}, - {52,54}, - {66,68}, - {72,74}, - {83,83}, - {97,97} - }}; - - // Choices for block size - inline constexpr int blockSizeSize = 5; - inline constexpr std::array<uint64_t, blockSizeSize> blockSizeChoices = {{ - 2048, // Default - test boundaries for EC 4K chunk size - 512, - 3767, - 4096, - 32768 - }}; - - // Choices for number of threads - inline constexpr int threadArraySize = 4; - inline constexpr std::array<int, threadArraySize> threadCountChoices = {{ - 1, // Default - 2, - 4, - 8 - }}; - - // Choices for EC k+m profile - inline constexpr int kmSize = 6; - inline constexpr std::array<std::pair<int,int>, kmSize> kmChoices = {{ - {2,2}, // Default - reasonable coverage - {2,1}, - {2,3}, - {3,2}, - {4,2}, - {5,1} - }}; - - // Choices for EC chunk size - inline constexpr int chunkSizeSize = 3; - inline constexpr std::array<uint64_t, chunkSizeSize> chunkSizeChoices = {{ - 4*1024, - 64*1024, - 256*1024 - }}; - - // Choices for plugin - inline constexpr int pluginListSize = 2; - inline constexpr std::array<std::string_view, - pluginListSize> pluginChoices = {{ - "jerasure", - "isa" - }}; - - inline constexpr std::array<std::pair<ceph::io_exerciser::Sequence, - ceph::io_exerciser::Sequence>, - 0> sequencePairs = {{}}; - - inline constexpr std::array<std::string, 0> poolChoices = {{}}; - - template <typename T, int N, const std::array<T, N>& Ts> - class ProgramOptionSelector - { - public: - ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, - po::variables_map vm, - const std::string& option_name, - bool set_forced, - bool select_first - ); - virtual ~ProgramOptionSelector() = default; - bool isForced(); - virtual const T choose(); - - protected: - ceph::util::random_number_generator<int>& rng; - static constexpr std::array<T, N> choices = Ts; - - std::optional<T> force_value; - std::optional<T> first_value; - - std::string option_name; - }; - - class SelectObjectSize - : public ProgramOptionSelector<std::pair<int, int>, - io_sequence::tester::objectSizeSize, - io_sequence::tester::objectSizeChoices> - { - public: - SelectObjectSize(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - }; - - class SelectBlockSize - : public ProgramOptionSelector<uint64_t, - io_sequence::tester::blockSizeSize, - io_sequence::tester::blockSizeChoices> - { - public: - SelectBlockSize(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - }; - - class SelectNumThreads - : public ProgramOptionSelector<int, - io_sequence::tester::threadArraySize, - io_sequence::tester::threadCountChoices> - { - public: - SelectNumThreads(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - }; - - class SelectSeqRange - : public ProgramOptionSelector<std::pair<ceph::io_exerciser::Sequence, - ceph::io_exerciser::Sequence>, - 0, io_sequence::tester::sequencePairs> - { - public: - SelectSeqRange(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - - const std::pair<ceph::io_exerciser::Sequence, - ceph::io_exerciser::Sequence> choose() override; - }; - - class SelectErasureKM - : public ProgramOptionSelector<std::pair<int,int>, - io_sequence::tester::kmSize, - io_sequence::tester::kmChoices> - { - public: - SelectErasureKM(ceph::util::random_number_generator<int>& rng, +namespace ceph { +namespace io_sequence::tester { +// Choices for min and max object size +inline constexpr size_t objectSizeSize = 10; +inline constexpr std::array<std::pair<int, int>, objectSizeSize> + objectSizeChoices = {{{1, 32}, // Default - best for boundary checking + {12, 14}, + {28, 30}, + {36, 38}, + {42, 44}, + {52, 54}, + {66, 68}, + {72, 74}, + {83, 83}, + {97, 97}}}; + +// Choices for block size +inline constexpr int blockSizeSize = 5; +inline constexpr std::array<uint64_t, blockSizeSize> blockSizeChoices = { + {2048, // Default - test boundaries for EC 4K chunk size + 512, 3767, 4096, 32768}}; + +// Choices for number of threads +inline constexpr int threadArraySize = 4; +inline constexpr std::array<int, threadArraySize> threadCountChoices = { + {1, // Default + 2, 4, 8}}; + +// Choices for EC k+m profile +inline constexpr int kmSize = 6; +inline constexpr std::array<std::pair<int, int>, kmSize> kmChoices = { + {{2, 2}, // Default - reasonable coverage + {2, 1}, + {2, 3}, + {3, 2}, + {4, 2}, + {5, 1}}}; + +// Choices for EC chunk size +inline constexpr int chunkSizeSize = 3; +inline constexpr std::array<uint64_t, chunkSizeSize> chunkSizeChoices = { + {4 * 1024, 64 * 1024, 256 * 1024}}; + +// Choices for plugin +inline constexpr int pluginListSize = 2; +inline constexpr std::array<std::string_view, pluginListSize> pluginChoices = { + {"jerasure", "isa"}}; + +inline constexpr std::array< + std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>, 0> + sequencePairs = {{}}; + +inline constexpr std::array<std::string, 0> poolChoices = {{}}; + +template <typename T, int N, const std::array<T, N>& Ts> +class ProgramOptionSelector { + public: + ProgramOptionSelector(ceph::util::random_number_generator<int>& rng, + po::variables_map vm, const std::string& option_name, + bool set_forced, bool select_first); + virtual ~ProgramOptionSelector() = default; + bool isForced(); + virtual const T choose(); + + protected: + ceph::util::random_number_generator<int>& rng; + static constexpr std::array<T, N> choices = Ts; + + std::optional<T> force_value; + std::optional<T> first_value; + + std::string option_name; +}; + +class SelectObjectSize + : public ProgramOptionSelector<std::pair<int, int>, + io_sequence::tester::objectSizeSize, + io_sequence::tester::objectSizeChoices> { + public: + SelectObjectSize(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectBlockSize + : public ProgramOptionSelector<uint64_t, io_sequence::tester::blockSizeSize, + io_sequence::tester::blockSizeChoices> { + public: + SelectBlockSize(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectNumThreads + : public ProgramOptionSelector<int, io_sequence::tester::threadArraySize, + io_sequence::tester::threadCountChoices> { + public: + SelectNumThreads(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectSeqRange + : public ProgramOptionSelector< + std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence>, + 0, io_sequence::tester::sequencePairs> { + public: + SelectSeqRange(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); + + const std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence> + choose() override; +}; + +class SelectErasureKM + : public ProgramOptionSelector<std::pair<int, int>, + io_sequence::tester::kmSize, + io_sequence::tester::kmChoices> { + public: + SelectErasureKM(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectErasurePlugin + : public ProgramOptionSelector<std::string_view, + io_sequence::tester::pluginListSize, + io_sequence::tester::pluginChoices> { + public: + SelectErasurePlugin(ceph::util::random_number_generator<int>& rng, po::variables_map vm); - }; - - class SelectErasurePlugin - : public ProgramOptionSelector<std::string_view, - io_sequence::tester::pluginListSize, - io_sequence::tester::pluginChoices> - { - public: - SelectErasurePlugin(ceph::util::random_number_generator<int>& rng, - po::variables_map vm); - }; - - class SelectErasureChunkSize - : public ProgramOptionSelector<uint64_t, - io_sequence::tester::chunkSizeSize, - io_sequence::tester::chunkSizeChoices> - { - public: - SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, po::variables_map vm); - }; - - class SelectECPool - : public ProgramOptionSelector<std::string, - 0, - io_sequence::tester::poolChoices> - { - public: - SelectECPool(ceph::util::random_number_generator<int>& rng, - po::variables_map vm, - librados::Rados& rados, - bool dry_run); - const std::string choose() override; - - private: - void create_pool(librados::Rados& rados, - const std::string& pool_name, - const std::string& plugin, - uint64_t chunk_size, - int k, int m); - - protected: - librados::Rados& rados; - bool dry_run; - - SelectErasureKM skm; - SelectErasurePlugin spl; - SelectErasureChunkSize scs; - }; - - class TestObject - { - public: - TestObject( const std::string oid, - librados::Rados& rados, - boost::asio::io_context& asio, - ceph::io_sequence::tester::SelectBlockSize& sbs, - ceph::io_sequence::tester::SelectECPool& spl, - ceph::io_sequence::tester::SelectObjectSize& sos, - ceph::io_sequence::tester::SelectNumThreads& snt, - ceph::io_sequence::tester::SelectSeqRange& ssr, - ceph::util::random_number_generator<int>& rng, - ceph::mutex& lock, - ceph::condition_variable& cond, - bool dryrun, - bool verbose, - std::optional<int> seqseed); - - int get_num_io(); - bool readyForIo(); - bool next(); - bool finished(); - - protected: - std::unique_ptr<ceph::io_exerciser::Model> exerciser_model; - std::pair<int,int> obj_size_range; - std::pair<ceph::io_exerciser::Sequence, - ceph::io_exerciser::Sequence> seq_range; - ceph::io_exerciser::Sequence curseq; - std::unique_ptr<ceph::io_exerciser::IoSequence> seq; - std::unique_ptr<ceph::io_exerciser::IoOp> op; - bool done; - ceph::util::random_number_generator<int>& rng; - bool verbose; - std::optional<int> seqseed; - }; - - class TestRunner - { - public: - TestRunner(po::variables_map& vm, librados::Rados& rados); - ~TestRunner(); - - bool run_test(); - - private: - librados::Rados& rados; - int seed; - ceph::util::random_number_generator<int> rng; - - ceph::io_sequence::tester::SelectBlockSize sbs; - ceph::io_sequence::tester::SelectObjectSize sos; - ceph::io_sequence::tester::SelectECPool spo; - ceph::io_sequence::tester::SelectNumThreads snt; - ceph::io_sequence::tester::SelectSeqRange ssr; - - boost::asio::io_context asio; - std::thread thread; - std::optional<boost::asio::executor_work_guard< - boost::asio::io_context::executor_type>> guard; - ceph::mutex lock = ceph::make_mutex("RadosIo::lock"); - ceph::condition_variable cond; - - bool input_valid; - - bool verbose; - bool dryrun; - std::optional<int> seqseed; - bool interactive; - - bool show_sequence; - bool show_help; - - int num_objects; - std::string object_name; - - std::string get_token(); - uint64_t get_numeric_token(); - - bool run_automated_test(); - - bool run_interactive_test(); - - void help(); - void list_sequence(); - }; - } -} +}; + +class SelectErasureChunkSize + : public ProgramOptionSelector<uint64_t, io_sequence::tester::chunkSizeSize, + io_sequence::tester::chunkSizeChoices> { + public: + SelectErasureChunkSize(ceph::util::random_number_generator<int>& rng, + po::variables_map vm); +}; + +class SelectECPool + : public ProgramOptionSelector<std::string, 0, + io_sequence::tester::poolChoices> { + public: + SelectECPool(ceph::util::random_number_generator<int>& rng, + po::variables_map vm, librados::Rados& rados, bool dry_run, + bool allow_pool_autoscaling, bool allow_pool_balancer, + bool allow_pool_deep_scrubbing, bool allow_pool_scrubbing, + bool test_recovery); + const std::string choose() override; + + bool get_allow_pool_autoscaling() { return allow_pool_autoscaling; } + bool get_allow_pool_balancer() { return allow_pool_balancer; } + bool get_allow_pool_deep_scrubbing() { return allow_pool_deep_scrubbing; } + bool get_allow_pool_scrubbing() { return allow_pool_scrubbing; } + int getChosenK() const { return k; } + int getChosenM() const { return m; } + + private: + void create_pool(librados::Rados& rados, const std::string& pool_name, + const std::string& plugin, uint64_t chunk_size, int k, + int m); + + protected: + librados::Rados& rados; + bool dry_run; + bool allow_pool_autoscaling; + bool allow_pool_balancer; + bool allow_pool_deep_scrubbing; + bool allow_pool_scrubbing; + bool test_recovery; + int k; + int m; + + SelectErasureKM skm; + SelectErasurePlugin spl; + SelectErasureChunkSize scs; +}; + +class TestObject { + public: + TestObject(const std::string oid, librados::Rados& rados, + boost::asio::io_context& asio, + ceph::io_sequence::tester::SelectBlockSize& sbs, + ceph::io_sequence::tester::SelectECPool& spl, + ceph::io_sequence::tester::SelectObjectSize& sos, + ceph::io_sequence::tester::SelectNumThreads& snt, + ceph::io_sequence::tester::SelectSeqRange& ssr, + ceph::util::random_number_generator<int>& rng, ceph::mutex& lock, + ceph::condition_variable& cond, bool dryrun, bool verbose, + std::optional<int> seqseed, bool testRecovery); + + int get_num_io(); + bool readyForIo(); + bool next(); + bool finished(); + + protected: + std::unique_ptr<ceph::io_exerciser::Model> exerciser_model; + std::pair<int, int> obj_size_range; + std::pair<ceph::io_exerciser::Sequence, ceph::io_exerciser::Sequence> + seq_range; + ceph::io_exerciser::Sequence curseq; + std::unique_ptr<ceph::io_exerciser::IoSequence> seq; + std::unique_ptr<ceph::io_exerciser::IoOp> op; + bool done; + ceph::util::random_number_generator<int>& rng; + bool verbose; + std::optional<int> seqseed; + int poolK; + int poolM; + bool testrecovery; +}; + +class TestRunner { + public: + TestRunner(po::variables_map& vm, librados::Rados& rados); + ~TestRunner(); + + bool run_test(); + + private: + librados::Rados& rados; + int seed; + ceph::util::random_number_generator<int> rng; + + ceph::io_sequence::tester::SelectBlockSize sbs; + ceph::io_sequence::tester::SelectObjectSize sos; + ceph::io_sequence::tester::SelectECPool spo; + ceph::io_sequence::tester::SelectNumThreads snt; + ceph::io_sequence::tester::SelectSeqRange ssr; + + boost::asio::io_context asio; + std::thread thread; + std::optional< + boost::asio::executor_work_guard<boost::asio::io_context::executor_type>> + guard; + ceph::mutex lock = ceph::make_mutex("RadosIo::lock"); + ceph::condition_variable cond; + + bool input_valid; + + bool verbose; + bool dryrun; + std::optional<int> seqseed; + bool interactive; + + bool testrecovery; + + bool allow_pool_autoscaling; + bool allow_pool_balancer; + bool allow_pool_deep_scrubbing; + bool allow_pool_scrubbing; + + bool show_sequence; + bool show_help; + + int num_objects; + std::string object_name; + + std::string line; + ceph::split split = ceph::split(""); + ceph::spliterator tokens; + + void clear_tokens(); + std::string get_token(); + std::optional<std::string> get_optional_token(); + uint64_t get_numeric_token(); + std::optional<uint64_t> get_optional_numeric_token(); + + bool run_automated_test(); + + bool run_interactive_test(); + + void help(); + void list_sequence(bool testrecovery); +}; +} // namespace io_sequence::tester +} // namespace ceph diff --git a/src/test/pybind/pytest.ini b/src/test/pybind/pytest.ini index dccf2a346dc..97569e88299 100644 --- a/src/test/pybind/pytest.ini +++ b/src/test/pybind/pytest.ini @@ -7,3 +7,4 @@ markers = stats tier watch + wait diff --git a/src/test/pybind/test_rados.py b/src/test/pybind/test_rados.py index cb2a4f96101..25423bd8dcb 100644 --- a/src/test/pybind/test_rados.py +++ b/src/test/pybind/test_rados.py @@ -207,7 +207,7 @@ class TestRados(object): def test_get_fsid(self): fsid = self.rados.get_fsid() - assert re.match('[0-9a-f\-]{36}', fsid, re.I) + assert re.match(r'[0-9a-f\-]{36}', fsid, re.I) def test_blocklist_add(self): self.rados.blocklist_add("1.2.3.4/123", 1) diff --git a/src/test/rgw/rgw_multi/tests.py b/src/test/rgw/rgw_multi/tests.py index d95feb5aa95..433cd034fe0 100644 --- a/src/test/rgw/rgw_multi/tests.py +++ b/src/test/rgw/rgw_multi/tests.py @@ -15,6 +15,7 @@ import boto import boto.s3.connection from boto.s3.website import WebsiteConfiguration from boto.s3.cors import CORSConfiguration +from botocore.exceptions import ClientError from nose.tools import eq_ as eq from nose.tools import assert_not_equal, assert_equal, assert_true, assert_false @@ -3638,4 +3639,23 @@ def test_copy_object_different_bucket(): CopySource = source_bucket.name + '/' + objname) zonegroup_bucket_checkpoint(zonegroup_conns, dest_bucket.name) - + +def test_bucket_create_location_constraint(): + for zonegroup in realm.current_period.zonegroups: + zonegroup_conns = ZonegroupConns(zonegroup) + for zg in realm.current_period.zonegroups: + z = zonegroup_conns.rw_zones[0] + bucket_name = gen_bucket_name() + if zg.name == zonegroup.name: + # my zonegroup should pass + z.s3_client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': zg.name}) + # check bucket location + response = z.s3_client.get_bucket_location(Bucket=bucket_name) + assert_equal(response['LocationConstraint'], zg.name) + else: + # other zonegroup should fail with 400 + e = assert_raises(ClientError, + z.s3_client.create_bucket, + Bucket=bucket_name, + CreateBucketConfiguration={'LocationConstraint': zg.name}) + assert e.response['ResponseMetadata']['HTTPStatusCode'] == 400 diff --git a/src/test/rgw/test_rgw_iam_policy.cc b/src/test/rgw/test_rgw_iam_policy.cc index 7dadb7812ff..1d13c2aa013 100644 --- a/src/test/rgw/test_rgw_iam_policy.cc +++ b/src/test/rgw/test_rgw_iam_policy.cc @@ -75,6 +75,8 @@ using rgw::IAM::s3GetObjectTagging; using rgw::IAM::s3GetObjectVersion; using rgw::IAM::s3GetObjectVersionTagging; using rgw::IAM::s3GetObjectVersionTorrent; +using rgw::IAM::s3GetObjectAttributes; +using rgw::IAM::s3GetObjectVersionAttributes; using rgw::IAM::s3GetPublicAccessBlock; using rgw::IAM::s3GetReplicationConfiguration; using rgw::IAM::s3ListAllMyBuckets; @@ -419,6 +421,8 @@ TEST_F(PolicyTest, Parse3) { act2[s3GetObjectVersionAcl] = 1; act2[s3GetObjectTorrent] = 1; act2[s3GetObjectVersionTorrent] = 1; + act2[s3GetObjectAttributes] = 1; + act2[s3GetObjectVersionAttributes] = 1; act2[s3GetAccelerateConfiguration] = 1; act2[s3GetBucketAcl] = 1; act2[s3GetBucketOwnershipControls] = 1; @@ -487,6 +491,8 @@ TEST_F(PolicyTest, Eval3) { s3allow[s3GetObjectVersion] = 1; s3allow[s3GetObjectAcl] = 1; s3allow[s3GetObjectVersionAcl] = 1; + s3allow[s3GetObjectAttributes] = 1; + s3allow[s3GetObjectVersionAttributes] = 1; s3allow[s3GetObjectTorrent] = 1; s3allow[s3GetObjectVersionTorrent] = 1; s3allow[s3GetAccelerateConfiguration] = 1; @@ -883,6 +889,8 @@ TEST_F(ManagedPolicyTest, AmazonS3ReadOnlyAccess) act[s3GetObjectVersionAcl] = 1; act[s3GetObjectTorrent] = 1; act[s3GetObjectVersionTorrent] = 1; + act[s3GetObjectAttributes] = 1; + act[s3GetObjectVersionAttributes] = 1; act[s3GetAccelerateConfiguration] = 1; act[s3GetBucketAcl] = 1; act[s3GetBucketOwnershipControls] = 1; diff --git a/src/test/test_ipaddr.cc b/src/test/test_ipaddr.cc index 49038815318..21df1d4056b 100644 --- a/src/test/test_ipaddr.cc +++ b/src/test/test_ipaddr.cc @@ -995,3 +995,158 @@ TEST(pick_address, ipv4_ipv6_enabled2) ASSERT_EQ(-1, r); } } + +// Test for IPv4 address +TEST(is_addr_in_subnet, ipv4) +{ + std::string public_network = "10.1.1.0/24"; + entity_addr_t addr; + addr.parse("10.1.1.2", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + bool r = is_addr_in_subnet(cct.get(), public_network, addr); + ASSERT_EQ(true, r); +} + +// Test for IPv6 address +TEST(is_addr_in_subnet, ipv6) +{ + std::string public_network = "2001:db8::/64"; + entity_addr_t addr; + addr.parse("2001:db8::1", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv6", "true"); + cct->_conf.set_val("ms_bind_ipv4", "false"); + + bool r = is_addr_in_subnet(cct.get(), public_network, addr); + ASSERT_EQ(true, r); +} + +// Test for invalid address +TEST(is_addr_in_subnet, invalid_address) +{ + std::string public_network = "10.1.1.0/24"; + entity_addr_t addr; + addr.parse("192.168.1.1", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + bool r = is_addr_in_subnet(cct.get(), public_network, addr); + ASSERT_EQ(false, r); +} + +// Test for malformed address +TEST(is_addr_in_subnet, malformed_address) +{ + std::string public_network = "10.1.1.0/24"; + entity_addr_t addr; + addr.parse("invalid_address", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + // Test with a malformed address + bool r = is_addr_in_subnet(cct.get(), public_network, addr); + ASSERT_EQ(false, r); +} + +TEST(is_addr_in_subnet, boundary_ipv4) +{ + std::string public_network = "10.1.1.0/24"; + entity_addr_t addr_low; + addr_low.parse("10.1.1.0", nullptr); + entity_addr_t addr_high; + addr_high.parse("10.1.1.255", nullptr); + entity_addr_t addr_out; + addr_out.parse("10.1.2.0", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_low)); + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_high)); + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network, addr_out)); +} + +TEST(is_addr_in_subnet, boundary_ipv6) +{ + std::string public_network = "2001:db8::/64"; + entity_addr_t addr_low; + addr_low.parse("2001:db8::", nullptr); + entity_addr_t addr_high; + addr_high.parse("2001:db8:0:0:ffff:ffff:ffff:ffff", nullptr); + entity_addr_t addr_out; + addr_out.parse("2001:db9::", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv6", "true"); + cct->_conf.set_val("ms_bind_ipv4", "false"); + + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_low)); + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network, addr_high)); + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network, addr_out)); +} + +TEST(is_addr_in_subnet, overlapping_subnets) +{ + std::string public_network_1 = "10.1.1.0/24"; + std::string public_network_2 = "10.1.2.0/24"; + entity_addr_t addr; + addr.parse("10.1.1.5", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "false"); + + ASSERT_TRUE(is_addr_in_subnet(cct.get(), public_network_1, addr)); + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_2, addr)); +} + +TEST(is_addr_in_subnet, mismatched_family) +{ + std::string public_network_1 = "2001:db8::/64"; + entity_addr_t addr_1; + addr_1.parse("10.1.1.5", nullptr); + + std::string public_network_2 = "10.1.1.0/24"; + entity_addr_t addr_2; + addr_2.parse("2001:db8::1", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + cct->_conf.set_val("ms_bind_ipv4", "true"); + cct->_conf.set_val("ms_bind_ipv6", "true"); + + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_1, addr_1)); + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_2, addr_2)); +} + +TEST(is_addr_in_subnet, invalid_subnets) +{ + std::string public_network_1 = "10.1.1.0/33"; + std::string public_network_2 = "25.0.0.99/10"; + entity_addr_t addr; + addr.parse("10.1.1.2", nullptr); + + boost::intrusive_ptr<CephContext> cct(new CephContext(CEPH_ENTITY_TYPE_OSD), false); + cct->_conf._clear_safe_to_start_threads(); + + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_1, addr)); // Invalid prefix + ASSERT_FALSE(is_addr_in_subnet(cct.get(), public_network_2, addr)); // Invalid subnet string +} + diff --git a/src/vstart.sh b/src/vstart.sh index b445e250a00..a992f33c856 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -159,6 +159,7 @@ smallmds=0 short=0 crimson=0 ec=0 +cephexporter=0 cephadm=0 parallel=true restart=1 @@ -233,6 +234,7 @@ options: -G disable Kerberos/GSSApi authentication --hitset <pool> <hit_set_type>: enable hitset tracking -e : create an erasure pool + --cephexporter: start the ceph-exporter daemon -o config add extra config parameters to all sections --rgw_port specify ceph rgw http listen port --rgw_frontend specify the rgw frontend configuration @@ -372,6 +374,9 @@ case $1 in -e) ec=1 ;; + --cephexporter) + cephexporter=1 + ;; --new | -n) new=1 ;; @@ -1140,6 +1145,17 @@ EOF fi } +start_cephexporter() { + debug echo "Starting Ceph exporter daemon..." + + # Define socket directory for the exporter + # Start the exporter daemon + prunb ceph-exporter \ + -c "$conf_fn" \ + --sock-dir "$CEPH_ASOK_DIR" \ + --addrs "$IP" +} + start_osd() { if [ $inc_osd_num -gt 0 ]; then old_maxosd=$($CEPH_BIN/ceph osd getmaxosd | sed -e 's/max_osd = //' -e 's/ in epoch.*//') @@ -1738,6 +1754,10 @@ if [ $CEPH_NUM_MDS -gt 0 ]; then ceph_adm fs authorize \* "client.fs" / rwp >> "$keyring_fn" fi +if [ "$cephexporter" -eq 1 ]; then + start_cephexporter +fi + # Don't set max_mds until all the daemons are started, otherwise # the intended standbys might end up in active roles. if [ "$CEPH_MAX_MDS" -gt 1 ]; then |