Diffstat (limited to '')
-rwxr-xr-xsrc/cephadm/cephadm.py12
-rw-r--r--src/cephadm/cephadmlib/constants.py3
-rw-r--r--src/cephadm/cephadmlib/daemons/mgmt_gateway.py24
-rw-r--r--src/cephadm/cephadmlib/daemons/smb.py78
-rw-r--r--src/cls/rbd/cls_rbd_client.cc74
-rw-r--r--src/cls/rbd/cls_rbd_client.h10
-rw-r--r--src/common/options/mds.yaml.in11
-rw-r--r--src/common/options/osd.yaml.in64
-rw-r--r--src/common/options/rgw.yaml.in8
-rw-r--r--src/crimson/common/gated.h85
-rw-r--r--src/crimson/net/io_handler.h2
-rw-r--r--src/crimson/os/alienstore/alien_store.cc33
-rw-r--r--src/crimson/os/alienstore/alien_store.h6
-rw-r--r--src/crimson/os/seastore/backref/btree_backref_manager.cc2
-rw-r--r--src/crimson/os/seastore/btree/fixed_kv_node.h2
-rw-r--r--src/crimson/os/seastore/cache.cc168
-rw-r--r--src/crimson/os/seastore/cache.h166
-rw-r--r--src/crimson/os/seastore/cached_extent.h2
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc2
-rw-r--r--src/crimson/os/seastore/seastore.cc24
-rw-r--r--src/crimson/os/seastore/seastore_types.cc74
-rw-r--r--src/crimson/os/seastore/seastore_types.h131
-rw-r--r--src/crimson/os/seastore/transaction_manager.h4
-rw-r--r--src/crimson/osd/ec_backend.cc4
-rw-r--r--src/crimson/osd/object_metadata_helper.cc21
-rw-r--r--src/crimson/osd/ops_executer.cc60
-rw-r--r--src/crimson/osd/ops_executer.h96
-rw-r--r--src/crimson/osd/osd.cc16
-rw-r--r--src/crimson/osd/osd.h4
-rw-r--r--src/crimson/osd/osd_operations/background_recovery.cc3
-rw-r--r--src/crimson/osd/osd_operations/background_recovery.h12
-rw-r--r--src/crimson/osd/osd_operations/snaptrim_event.cc66
-rw-r--r--src/crimson/osd/osd_operations/snaptrim_event.h8
-rw-r--r--src/crimson/osd/pg.cc120
-rw-r--r--src/crimson/osd/pg.h16
-rw-r--r--src/crimson/osd/pg_backend.h3
-rw-r--r--src/crimson/osd/pg_recovery.cc28
-rw-r--r--src/crimson/osd/pg_recovery.h3
-rw-r--r--src/crimson/osd/pg_recovery_listener.h4
-rw-r--r--src/crimson/osd/replicated_backend.cc21
-rw-r--r--src/include/ceph_fs.h1
-rw-r--r--src/include/cephfs/types.h2
-rw-r--r--src/include/config-h.in.cmake3
-rw-r--r--src/librbd/CMakeLists.txt10
-rw-r--r--src/librbd/api/Group.cc53
-rw-r--r--src/librbd/api/Group.h1
-rw-r--r--src/librbd/group/ListSnapshotsRequest.cc187
-rw-r--r--src/librbd/group/ListSnapshotsRequest.h93
-rw-r--r--src/librbd/librbd.cc12
-rw-r--r--src/librbd/migration/FileStream.cc12
-rw-r--r--src/librbd/migration/FileStream.h4
-rw-r--r--src/librbd/migration/HttpStream.cc12
-rw-r--r--src/librbd/migration/HttpStream.h4
-rw-r--r--src/librbd/migration/NBDStream.cc370
-rw-r--r--src/librbd/migration/NBDStream.h67
-rw-r--r--src/librbd/migration/RawSnapshot.cc10
-rw-r--r--src/librbd/migration/S3Stream.cc12
-rw-r--r--src/librbd/migration/S3Stream.h4
-rw-r--r--src/librbd/migration/SourceSpecBuilder.cc7
-rw-r--r--src/librbd/migration/StreamInterface.h4
-rw-r--r--src/mds/CDir.cc7
-rw-r--r--src/mds/CInode.cc34
-rw-r--r--src/mds/CInode.h15
-rw-r--r--src/mds/DamageTable.cc59
-rw-r--r--src/mds/DamageTable.h16
-rw-r--r--src/mds/MDCache.cc195
-rw-r--r--src/mds/MDCache.h8
-rw-r--r--src/mds/MDSDaemon.cc5
-rw-r--r--src/mds/MDSRank.cc19
-rw-r--r--src/mds/MDSRank.h1
-rw-r--r--src/mds/ScrubHeader.h52
-rw-r--r--src/mds/ScrubStack.cc183
-rw-r--r--src/mds/ScrubStack.h31
-rw-r--r--src/messages/MMDSScrubStats.h109
-rwxr-xr-xsrc/mon/NVMeofGwMap.cc212
-rwxr-xr-xsrc/mon/NVMeofGwMap.h9
-rw-r--r--src/mon/NVMeofGwMon.cc102
-rw-r--r--src/mon/NVMeofGwMon.h3
-rwxr-xr-xsrc/mon/NVMeofGwSerialize.h3
-rwxr-xr-xsrc/mon/NVMeofGwTypes.h10
-rw-r--r--src/osd/PG.h1
-rw-r--r--src/osd/PeeringState.cc2
-rw-r--r--src/osd/PeeringState.h1
-rw-r--r--src/osd/SnapMapper.cc44
-rw-r--r--src/osd/SnapMapper.h4
-rw-r--r--src/osd/scrubber/pg_scrubber.cc10
-rw-r--r--src/osd/scrubber/scrub_job.cc31
-rw-r--r--src/osd/scrubber/scrub_job.h6
-rw-r--r--src/osd/scrubber/scrub_machine.cc4
-rw-r--r--src/osd/scrubber/scrub_machine.h4
-rw-r--r--src/osd/scrubber_common.h4
-rw-r--r--src/pybind/mgr/cephadm/module.py24
-rw-r--r--src/pybind/mgr/cephadm/service_discovery.py17
-rw-r--r--src/pybind/mgr/cephadm/services/monitoring.py4
-rw-r--r--src/pybind/mgr/cephadm/services/nvmeof.py13
-rw-r--r--src/pybind/mgr/cephadm/services/smb.py6
-rw-r--r--src/pybind/mgr/cephadm/ssh.py8
-rw-r--r--src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j22
-rw-r--r--src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j220
-rw-r--r--src/pybind/mgr/cephadm/tests/test_service_discovery.py18
-rw-r--r--src/pybind/mgr/cephadm/tests/test_services.py21
-rw-r--r--src/pybind/mgr/dashboard/controllers/host.py13
-rw-r--r--src/pybind/mgr/dashboard/controllers/nvmeof.py38
-rw-r--r--src/pybind/mgr/dashboard/frontend/cypress/e2e/block/images.po.ts2
-rw-r--r--src/pybind/mgr/dashboard/frontend/cypress/e2e/page-helper.po.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.html3
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.html3
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts28
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.html2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.html2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.html6
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.spec.ts8
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts11
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.html2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-details/rbd-details.component.ts2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/configuration/configuration.component.spec.ts3
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.spec.ts2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.html2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-import/rgw-multisite-import.component.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/shared/api/host.service.ts4
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/shared/components/alert-panel/alert-panel.component.html2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.html111
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts140
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/cd-date.pipe.ts2
-rw-r--r--src/pybind/mgr/dashboard/frontend/src/app/shared/services/task-message.service.ts2
-rw-r--r--src/pybind/mgr/dashboard/openapi.yaml50
-rw-r--r--src/pybind/mgr/dashboard/services/nvmeof_cli.py4
-rw-r--r--src/pybind/mgr/dashboard/services/nvmeof_client.py19
-rw-r--r--src/pybind/mgr/dashboard/services/nvmeof_conf.py82
-rw-r--r--src/pybind/mgr/smb/enums.py17
-rw-r--r--src/pybind/mgr/smb/handler.py24
-rw-r--r--src/pybind/mgr/smb/tests/test_smb.py16
-rw-r--r--src/pybind/mgr/snap_schedule/fs/schedule_client.py2
-rw-r--r--src/python-common/ceph/deployment/service_spec.py6
-rw-r--r--src/rgw/driver/daos/rgw_sal_daos.cc15
-rw-r--r--src/rgw/driver/daos/rgw_sal_daos.h11
-rw-r--r--src/rgw/driver/dbstore/common/dbstore.cc42
-rw-r--r--src/rgw/driver/dbstore/common/dbstore.h20
-rw-r--r--src/rgw/driver/dbstore/sqlite/sqliteDB.cc22
-rw-r--r--src/rgw/driver/dbstore/tests/dbstore_tests.cc63
-rw-r--r--src/rgw/driver/motr/rgw_sal_motr.cc20
-rw-r--r--src/rgw/driver/motr/rgw_sal_motr.h12
-rw-r--r--src/rgw/driver/posix/rgw_sal_posix.cc18
-rw-r--r--src/rgw/driver/posix/rgw_sal_posix.h12
-rw-r--r--src/rgw/driver/rados/rgw_bucket.cc2
-rw-r--r--src/rgw/driver/rados/rgw_notify.cc35
-rw-r--r--src/rgw/driver/rados/rgw_object_expirer_core.cc2
-rw-r--r--src/rgw/driver/rados/rgw_putobj_processor.cc6
-rw-r--r--src/rgw/driver/rados/rgw_rados.cc18
-rw-r--r--src/rgw/driver/rados/rgw_rados.h7
-rw-r--r--src/rgw/driver/rados/rgw_sal_rados.cc284
-rw-r--r--src/rgw/driver/rados/rgw_sal_rados.h32
-rw-r--r--src/rgw/rgw_admin.cc14
-rw-r--r--src/rgw/rgw_asio_frontend.cc123
-rw-r--r--src/rgw/rgw_kafka.cc29
-rw-r--r--src/rgw/rgw_lc.cc156
-rw-r--r--src/rgw/rgw_lc.h14
-rw-r--r--src/rgw/rgw_op.cc61
-rw-r--r--src/rgw/rgw_process.cc20
-rw-r--r--src/rgw/rgw_sal.h98
-rw-r--r--src/rgw/rgw_sal_dbstore.cc32
-rw-r--r--src/rgw/rgw_sal_dbstore.h29
-rw-r--r--src/rgw/rgw_sal_filter.cc77
-rw-r--r--src/rgw/rgw_sal_filter.h59
-rw-r--r--src/rgw/rgw_sal_store.h70
m---------src/seastar0
-rw-r--r--src/test/cli/rbd/help.t5
-rw-r--r--src/test/crimson/test_calc_subsets.cc1
-rw-r--r--src/test/librados_test_stub/LibradosTestStub.cc7
-rw-r--r--src/test/librbd/CMakeLists.txt5
-rw-r--r--src/test/librbd/migration/test_mock_FileStream.cc28
-rw-r--r--src/test/librbd/migration/test_mock_HttpStream.cc31
-rw-r--r--src/test/librbd/migration/test_mock_NBDStream.cc730
-rw-r--r--src/test/librbd/migration/test_mock_RawSnapshot.cc19
-rw-r--r--src/test/librbd/migration/test_mock_S3Stream.cc31
-rw-r--r--src/test/librbd/mock/migration/MockStreamInterface.h8
-rw-r--r--src/test/librbd/test_Groups.cc102
-rw-r--r--src/test/objectstore/store_test.cc125
-rw-r--r--src/test/rgw/test_rgw_posix_driver.cc6
-rw-r--r--src/tools/rbd/action/Bench.cc56
-rw-r--r--src/tools/rbd_ggate/ggate_drv.c2
185 files changed, 5425 insertions, 1419 deletions
diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py
index e71addf7bfa..1ab98a0ac4f 100755
--- a/src/cephadm/cephadm.py
+++ b/src/cephadm/cephadm.py
@@ -595,6 +595,8 @@ def infer_local_ceph_image(ctx: CephadmContext, container_path: str) -> Optional
if digest and not digest.endswith('@'):
logger.info(f"Using ceph image with id '{image_id}' and tag '{tag}' created on {created_date}\n{digest}")
return digest
+ if container_info is not None:
+ logger.warning(f"Not using image '{container_info.image_id}' as it's not in list of non-dangling images with ceph=True label")
return None
@@ -2408,6 +2410,12 @@ def enable_cephadm_mgr_module(
logger.info('Enabling cephadm module...')
cli(['mgr', 'module', 'enable', 'cephadm'])
wait_for_mgr_restart()
+ # https://tracker.ceph.com/issues/67969
+    # luckily `ceph mgr module enable <module>` returns
+    # a zero rc when the module is already enabled, so
+    # this is harmless even when it is unnecessary
+ logger.info('Verifying orchestrator module is enabled...')
+ cli(['mgr', 'module', 'enable', 'orchestrator'])
logger.info('Setting orchestrator backend to cephadm...')
cli(['orch', 'set', 'backend', 'cephadm'])
@@ -3568,7 +3576,7 @@ def list_daemons(
elif daemon_type == 'grafana':
out, err, code = call(ctx,
[container_path, 'exec', container_id,
- 'grafana-server', '-v'],
+ 'grafana', 'server', '-v'],
verbosity=CallVerbosity.QUIET)
if not code and \
out.startswith('Version '):
@@ -4074,7 +4082,7 @@ def command_adopt_grafana(ctx, daemon_id, fsid):
ports = Monitoring.port_map['grafana']
endpoints = [EndPoint('0.0.0.0', p) for p in ports]
- _stop_and_disable(ctx, 'grafana-server')
+ _stop_and_disable(ctx, 'grafana server')
ident = DaemonIdentity(fsid, daemon_type, daemon_id)
data_dir_dst = make_data_dir(
diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py
index 37df1d48f43..f26cd361b58 100644
--- a/src/cephadm/cephadmlib/constants.py
+++ b/src/cephadm/cephadmlib/constants.py
@@ -19,7 +19,8 @@ DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29'
DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29'
DEFAULT_JAEGER_QUERY_IMAGE = 'quay.io/jaegertracing/jaeger-query:1.29'
DEFAULT_SMB_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64'
-DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:1.26.1'
+DEFAULT_SMBMETRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest'
+DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126'
DEFAULT_OAUTH2_PROXY_IMAGE = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0'
DEFAULT_REGISTRY = 'docker.io' # normalize unqualified digests to this
# ------------------------------------------------------------------------------
diff --git a/src/cephadm/cephadmlib/daemons/mgmt_gateway.py b/src/cephadm/cephadmlib/daemons/mgmt_gateway.py
index b0a6f0579d2..01c68a83ba8 100644
--- a/src/cephadm/cephadmlib/daemons/mgmt_gateway.py
+++ b/src/cephadm/cephadmlib/daemons/mgmt_gateway.py
@@ -5,13 +5,13 @@ import re
from ..call_wrappers import call, CallVerbosity
from ..container_daemon_form import ContainerDaemonForm, daemon_to_container
-from ..container_types import CephContainer
+from ..container_types import CephContainer, extract_uid_gid
from ..context import CephadmContext
from ..context_getters import fetch_configs
from ..daemon_form import register as register_daemon_form
from ..daemon_identity import DaemonIdentity
from ..deployment_utils import to_deployment_container
-from ..constants import DEFAULT_NGINX_IMAGE, UID_NOBODY, GID_NOGROUP
+from ..constants import DEFAULT_NGINX_IMAGE
from ..data_utils import dict_get, is_fsid
from ..file_utils import populate_files, makedirs, recursive_chown
from ..exceptions import Error
@@ -90,7 +90,7 @@ class MgmtGateway(ContainerDaemonForm):
return to_deployment_container(ctx, ctr)
def uid_gid(self, ctx: CephadmContext) -> Tuple[int, int]:
- return UID_NOBODY, GID_NOGROUP
+ return extract_uid_gid(ctx, file_path='/etc/nginx/')
def get_daemon_args(self) -> List[str]:
return []
@@ -150,6 +150,24 @@ class MgmtGateway(ContainerDaemonForm):
version = match.group(1)
return version
+ def customize_container_args(
+ self, ctx: CephadmContext, args: List[str]
+ ) -> None:
+ uid, _ = self.uid_gid(ctx)
+ extra_args = [
+ '--user',
+ str(uid),
+ ]
+ args.extend(extra_args)
+
+ def customize_process_args(
+ self, ctx: CephadmContext, args: List[str]
+ ) -> None:
+        # The following noqa comment intentionally suppresses warnings about using double
+        # quotes instead of single quotes. We use double quotes here so that single
+        # quotes end up in the final parsed output: nginx -g 'daemon off;'
+ args.extend(['nginx', '-g', "daemon off;"]) # noqa
+
def customize_container_mounts(
self, ctx: CephadmContext, mounts: Dict[str, str]
) -> None:
diff --git a/src/cephadm/cephadmlib/daemons/smb.py b/src/cephadm/cephadmlib/daemons/smb.py
index 0aecd418b1b..ae9acbc9c45 100644
--- a/src/cephadm/cephadmlib/daemons/smb.py
+++ b/src/cephadm/cephadmlib/daemons/smb.py
@@ -81,6 +81,8 @@ class Config:
smb_port: int
ceph_config_entity: str
vhostname: str
+ metrics_image: str
+ metrics_port: int
# clustering related values
rank: int
rank_generation: int
@@ -103,6 +105,8 @@ class Config:
smb_port: int = 0,
ceph_config_entity: str = 'client.admin',
vhostname: str = '',
+ metrics_image: str = '',
+ metrics_port: int = 0,
rank: int = -1,
rank_generation: int = -1,
cluster_meta_uri: str = '',
@@ -122,6 +126,8 @@ class Config:
self.smb_port = smb_port
self.ceph_config_entity = ceph_config_entity
self.vhostname = vhostname
+ self.metrics_image = metrics_image
+ self.metrics_port = metrics_port
self.rank = rank
self.rank_generation = rank_generation
self.cluster_meta_uri = cluster_meta_uri
@@ -155,15 +161,34 @@ def _container_dns_args(cfg: Config) -> List[str]:
return cargs
-class SambaContainerCommon:
- def __init__(
- self,
- cfg: Config,
- ) -> None:
+class ContainerCommon:
+ def __init__(self, cfg: Config, image: str = '') -> None:
self.cfg = cfg
+ self.image = image
def name(self) -> str:
- raise NotImplementedError('samba container name')
+ raise NotImplementedError('container name')
+
+ def envs(self) -> Dict[str, str]:
+ return {}
+
+ def envs_list(self) -> List[str]:
+ return []
+
+ def args(self) -> List[str]:
+ return []
+
+ def container_args(self) -> List[str]:
+ return []
+
+ def container_image(self) -> str:
+ return self.image
+
+
+class SambaContainerCommon(ContainerCommon):
+ def __init__(self, cfg: Config, image: str = '') -> None:
+ self.cfg = cfg
+ self.image = image
def envs(self) -> Dict[str, str]:
environ = {
@@ -196,9 +221,6 @@ class SambaContainerCommon:
args.append(f'--debug-delay={self.cfg.debug_delay}')
return args
- def container_args(self) -> List[str]:
- return []
-
class SambaNetworkedInitContainer(SambaContainerCommon):
"""SambaContainerCommon subclass that enables additional networking
@@ -233,6 +255,9 @@ class SMBDContainer(SambaContainerCommon):
cargs = []
if self.cfg.smb_port:
cargs.append(f'--publish={self.cfg.smb_port}:{self.cfg.smb_port}')
+ if self.cfg.metrics_port:
+ metrics_port = self.cfg.metrics_port
+ cargs.append(f'--publish={metrics_port}:{metrics_port}')
cargs.extend(_container_dns_args(self.cfg))
return cargs
@@ -284,6 +309,17 @@ class ConfigWatchContainer(SambaContainerCommon):
return super().args() + ['update-config', '--watch']
+class SMBMetricsContainer(ContainerCommon):
+ def name(self) -> str:
+ return 'smbmetrics'
+
+ def args(self) -> List[str]:
+ args = []
+ if self.cfg.metrics_port > 0:
+ args.append(f'--port={self.cfg.metrics_port}')
+ return args
+
+
class CTDBMigrateInitContainer(SambaContainerCommon):
def name(self) -> str:
return 'ctdbMigrate'
@@ -358,13 +394,13 @@ class CTDBNodeMonitorContainer(SambaContainerCommon):
class ContainerLayout:
init_containers: List[SambaContainerCommon]
primary: SambaContainerCommon
- supplemental: List[SambaContainerCommon]
+ supplemental: List[ContainerCommon]
def __init__(
self,
init_containers: List[SambaContainerCommon],
primary: SambaContainerCommon,
- supplemental: List[SambaContainerCommon],
+ supplemental: List[ContainerCommon],
) -> None:
self.init_containers = init_containers
self.primary = primary
@@ -393,6 +429,7 @@ class SMB(ContainerDaemonForm):
self._cached_layout: Optional[ContainerLayout] = None
self._rank_info = context_getters.fetch_rank_info(ctx)
self.smb_port = 445
+ self.metrics_port = 9922
self._network_mapper = _NetworkMapper(ctx)
logger.debug('Created SMB ContainerDaemonForm instance')
@@ -431,6 +468,8 @@ class SMB(ContainerDaemonForm):
files = data_utils.dict_get(configs, 'files', {})
ceph_config_entity = configs.get('config_auth_entity', '')
vhostname = configs.get('virtual_hostname', '')
+ metrics_image = configs.get('metrics_image', '')
+ metrics_port = int(configs.get('metrics_port', '0'))
cluster_meta_uri = configs.get('cluster_meta_uri', '')
cluster_lock_uri = configs.get('cluster_lock_uri', '')
cluster_public_addrs = configs.get('cluster_public_addrs', [])
@@ -470,6 +509,8 @@ class SMB(ContainerDaemonForm):
smb_port=self.smb_port,
ceph_config_entity=ceph_config_entity,
vhostname=vhostname,
+ metrics_image=metrics_image,
+ metrics_port=metrics_port,
cluster_meta_uri=cluster_meta_uri,
cluster_lock_uri=cluster_lock_uri,
cluster_public_addrs=_public_addrs,
@@ -517,7 +558,7 @@ class SMB(ContainerDaemonForm):
if self._cached_layout:
return self._cached_layout
init_ctrs: List[SambaContainerCommon] = []
- ctrs: List[SambaContainerCommon] = []
+ ctrs: List[ContainerCommon] = []
init_ctrs.append(ConfigInitContainer(self._cfg))
ctrs.append(ConfigWatchContainer(self._cfg))
@@ -526,6 +567,11 @@ class SMB(ContainerDaemonForm):
init_ctrs.append(MustJoinContainer(self._cfg))
ctrs.append(WinbindContainer(self._cfg))
+ metrics_image = self._cfg.metrics_image.strip()
+ metrics_port = self._cfg.metrics_port
+ if metrics_image and metrics_port > 0:
+ ctrs.append(SMBMetricsContainer(self._cfg, metrics_image))
+
if self._cfg.clustered:
init_ctrs += [
CTDBMigrateInitContainer(self._cfg),
@@ -564,7 +610,7 @@ class SMB(ContainerDaemonForm):
)
def _to_sidecar_container(
- self, ctx: CephadmContext, smb_ctr: SambaContainerCommon
+ self, ctx: CephadmContext, smb_ctr: ContainerCommon
) -> SidecarContainer:
volume_mounts: Dict[str, str] = {}
container_args: List[str] = smb_ctr.container_args()
@@ -587,10 +633,11 @@ class SMB(ContainerDaemonForm):
identity = DaemonSubIdentity.from_parent(
self.identity, smb_ctr.name()
)
+ img = smb_ctr.container_image() or ctx.image or self.default_image
return SidecarContainer(
ctx,
entrypoint='',
- image=ctx.image or self.default_image,
+ image=img,
identity=identity,
container_args=container_args,
args=smb_ctr.args(),
@@ -673,6 +720,9 @@ class SMB(ContainerDaemonForm):
) -> None:
if not any(ep.port == self.smb_port for ep in endpoints):
endpoints.append(EndPoint('0.0.0.0', self.smb_port))
+ if self.metrics_port > 0:
+ if not any(ep.port == self.metrics_port for ep in endpoints):
+ endpoints.append(EndPoint('0.0.0.0', self.metrics_port))
def prepare_data_dir(self, data_dir: str, uid: int, gid: int) -> None:
self.validate()
diff --git a/src/cls/rbd/cls_rbd_client.cc b/src/cls/rbd/cls_rbd_client.cc
index ad480c47d5c..458bfd985c3 100644
--- a/src/cls/rbd/cls_rbd_client.cc
+++ b/src/cls/rbd/cls_rbd_client.cc
@@ -2757,28 +2757,65 @@ int group_snap_get_by_id(librados::IoCtx *ioctx, const std::string &oid,
return 0;
}
+
+void group_snap_list_start(librados::ObjectReadOperation *op,
+ const cls::rbd::GroupSnapshot &start,
+ uint64_t max_return)
+{
+ bufferlist bl;
+ encode(start, bl);
+ encode(max_return, bl);
+
+ op->exec("rbd", "group_snap_list", bl);
+}
+
+int group_snap_list_finish(bufferlist::const_iterator *iter,
+ std::vector<cls::rbd::GroupSnapshot> *snapshots)
+{
+ try {
+ decode(*snapshots, *iter);
+ } catch (const ceph::buffer::error &err) {
+ return -EBADMSG;
+ }
+ return 0;
+}
+
int group_snap_list(librados::IoCtx *ioctx, const std::string &oid,
const cls::rbd::GroupSnapshot &start,
uint64_t max_return,
std::vector<cls::rbd::GroupSnapshot> *snapshots)
{
- using ceph::encode;
- using ceph::decode;
- bufferlist inbl, outbl;
- encode(start, inbl);
- encode(max_return, inbl);
+ librados::ObjectReadOperation op;
+ group_snap_list_start(&op, start, max_return);
- int r = ioctx->exec(oid, "rbd", "group_snap_list", inbl, outbl);
+ bufferlist out_bl;
+ int r = ioctx->operate(oid, &op, &out_bl);
if (r < 0) {
return r;
}
- auto iter = outbl.cbegin();
+
+ auto it = out_bl.cbegin();
+ return group_snap_list_finish(&it, snapshots);
+}
+
+void group_snap_list_order_start(librados::ObjectReadOperation *op,
+ const std::string &start,
+ uint64_t max_return)
+{
+ bufferlist bl;
+ encode(start, bl);
+ encode(max_return, bl);
+ op->exec("rbd", "group_snap_list_order", bl);
+}
+
+int group_snap_list_order_finish(bufferlist::const_iterator *iter,
+ std::map<std::string, uint64_t> *snap_order)
+{
try {
- decode(*snapshots, iter);
+ decode(*snap_order, *iter);
} catch (const ceph::buffer::error &err) {
return -EBADMSG;
}
-
return 0;
}
@@ -2786,24 +2823,17 @@ int group_snap_list_order(librados::IoCtx *ioctx, const std::string &oid,
const std::string &start, uint64_t max_return,
std::map<std::string, uint64_t> *snap_order)
{
- using ceph::encode;
- using ceph::decode;
- bufferlist inbl, outbl;
- encode(start, inbl);
- encode(max_return, inbl);
+ librados::ObjectReadOperation op;
+ group_snap_list_order_start(&op, start, max_return);
- int r = ioctx->exec(oid, "rbd", "group_snap_list_order", inbl, outbl);
+ bufferlist out_bl;
+ int r = ioctx->operate(oid, &op, &out_bl);
if (r < 0) {
return r;
}
- auto iter = outbl.cbegin();
- try {
- decode(*snap_order, iter);
- } catch (const ceph::buffer::error &err) {
- return -EBADMSG;
- }
- return 0;
+ auto it = out_bl.cbegin();
+ return group_snap_list_order_finish(&it, snap_order);
}
// rbd_trash functions
diff --git a/src/cls/rbd/cls_rbd_client.h b/src/cls/rbd/cls_rbd_client.h
index 4005c51836c..b1553bd1f17 100644
--- a/src/cls/rbd/cls_rbd_client.h
+++ b/src/cls/rbd/cls_rbd_client.h
@@ -580,10 +580,20 @@ int group_snap_remove(librados::IoCtx *ioctx, const std::string &oid,
int group_snap_get_by_id(librados::IoCtx *ioctx, const std::string &oid,
const std::string &snap_id,
cls::rbd::GroupSnapshot *snapshot);
+void group_snap_list_start(librados::ObjectReadOperation *op,
+ const cls::rbd::GroupSnapshot &start,
+ uint64_t max_return);
+int group_snap_list_finish(ceph::buffer::list::const_iterator *iter,
+ std::vector<cls::rbd::GroupSnapshot> *snapshots);
int group_snap_list(librados::IoCtx *ioctx, const std::string &oid,
const cls::rbd::GroupSnapshot &start,
uint64_t max_return,
std::vector<cls::rbd::GroupSnapshot> *snapshots);
+void group_snap_list_order_start(librados::ObjectReadOperation *op,
+ const std::string &start_snap_id,
+ uint64_t max_return);
+int group_snap_list_order_finish(ceph::buffer::list::const_iterator *iter,
+ std::map<std::string, uint64_t> *snap_order);
int group_snap_list_order(librados::IoCtx *ioctx, const std::string &oid,
const std::string &snap_id, uint64_t max_return,
std::map<std::string, uint64_t> *snap_order);
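The start/finish split above turns group snapshot listing into a composable librados read operation instead of a bare ioctx->exec() call. A minimal, hypothetical caller sketch (not part of the patch; the function name and page size are illustrative) that pages through all snapshots of a group using the new helpers:

    #include <string>
    #include <vector>
    #include "include/rados/librados.hpp"
    #include "cls/rbd/cls_rbd_client.h"

    int list_all_group_snaps(librados::IoCtx& ioctx, const std::string& group_oid,
                             std::vector<cls::rbd::GroupSnapshot>* all_snaps) {
      cls::rbd::GroupSnapshot start;     // default-constructed cursor: begin at the start
      const uint64_t max_return = 64;    // arbitrary page size for this sketch
      while (true) {
        librados::ObjectReadOperation op;
        librbd::cls_client::group_snap_list_start(&op, start, max_return);
        ceph::bufferlist out_bl;
        int r = ioctx.operate(group_oid, &op, &out_bl);
        if (r < 0) {
          return r;
        }
        auto it = out_bl.cbegin();
        std::vector<cls::rbd::GroupSnapshot> page;
        r = librbd::cls_client::group_snap_list_finish(&it, &page);
        if (r < 0) {
          return r;
        }
        all_snaps->insert(all_snaps->end(), page.begin(), page.end());
        if (page.size() < max_return) {
          return 0;                      // short page: nothing left to list
        }
        start = page.back();             // resume after the last snapshot returned
      }
    }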
diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in
index dcf3eaac0d6..18efba561ed 100644
--- a/src/common/options/mds.yaml.in
+++ b/src/common/options/mds.yaml.in
@@ -1666,6 +1666,17 @@ options:
- mds
flags:
- runtime
+- name: mds_scrub_stats_review_period
+ type: uint
+ level: advanced
+ desc: Period for which scrub stats will be available for review.
+  long_desc: Number of days for which scrub stats remain available for review,
+    counted from the start of the scrub operation. After this period the stats
+    are automatically purged. These stats are not saved to disk, so any MDS
+    restart or failover will cause them to be lost.
+ default: 1
+ min: 1
+ max: 60
- name: mds_session_metadata_threshold
type: size
level: advanced
diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in
index 268a89154de..8dae546521d 100644
--- a/src/common/options/osd.yaml.in
+++ b/src/common/options/osd.yaml.in
@@ -520,6 +520,70 @@ options:
stats (inc. scrub/block duration) every this many seconds.
default: 120
with_legacy: false
+- name: osd_scrub_retry_delay
+ type: int
+ level: advanced
+ desc: Period (in seconds) before retrying a PG that has failed a prior scrub.
+  long_desc: Minimum delay after a failed attempt to scrub a PG. The delay is
+    applied to one of the scheduled scrubs for the PG (the next shallow scrub
+    or the next deep scrub), or to both.
+    This is the default value, used when the cause of the delay does not have
+    an associated configuration option; delay causes that have their own
+    option are listed under 'see also'.
+ default: 30
+ min: 1
+ see_also:
+ - osd_scrub_retry_pg_state
+ - osd_scrub_retry_after_noscrub
+ - osd_scrub_retry_new_interval
+ - osd_scrub_retry_trimming
+ with_legacy: false
+- name: osd_scrub_retry_after_noscrub
+ type: int
+ level: advanced
+ desc: Period (in seconds) before retrying to scrub a PG at a specific level
+ after detecting a no-scrub or no-deep-scrub flag
+ long_desc: Minimum delay after a failed attempt to scrub a PG at a level
+ (shallow or deep) that is disabled by cluster or pool no-scrub or no-deep-scrub
+ flags.
+ default: 60
+ min: 1
+ see_also:
+ - osd_scrub_retry_delay
+ with_legacy: false
+- name: osd_scrub_retry_pg_state
+ type: int
+ level: advanced
+ desc: Period (in seconds) before retrying to scrub a previously inactive/not-clean PG
+ long_desc: Minimum delay after a failed attempt to scrub a PG that is not
+ active and clean.
+ default: 60
+ min: 1
+ see_also:
+ - osd_scrub_retry_delay
+ with_legacy: false
+- name: osd_scrub_retry_trimming
+ type: int
+ level: advanced
+ desc: Period (in seconds) before retrying to scrub a previously snap-trimming PG
+ long_desc: Minimum delay after a failed attempt to scrub a PG that was performing
+ snap trimming and not available for scrubbing.
+ default: 10
+ min: 1
+ see_also:
+ - osd_scrub_retry_delay
+ with_legacy: false
+- name: osd_scrub_retry_new_interval
+ type: int
+ level: advanced
+ desc: Period (in seconds) before retrying a scrub aborted on a new interval
+  long_desc: Minimum delay before retrying after a scrub was aborted because
+    the PG interval changed.
+ default: 10
+ min: 1
+ see_also:
+ - osd_scrub_retry_delay
+ with_legacy: false
- name: osd_scrub_disable_reservation_queuing
type: bool
level: advanced
diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in
index 56dcc05a54b..f3d242b1fe3 100644
--- a/src/common/options/rgw.yaml.in
+++ b/src/common/options/rgw.yaml.in
@@ -3458,11 +3458,11 @@ options:
- name: rgw_max_listing_results
type: uint
level: advanced
- desc: Upper bound on results in listing operations, ListBucket max-keys
+ desc: Upper bound on results in listing operations, ListObjects max-keys
long_desc: This caps the maximum permitted value for listing-like operations in
- RGW S3. Affects ListBucket(max-keys), ListBucketVersions(max-keys), ListBucketMultipartUploads(max-uploads),
- ListMultipartUploadParts(max-parts)
- default: 1000
+    RGW S3. Affects ListObjects(max-keys), ListObjectVersions(max-keys),
+ ListMultipartUploads(max-uploads), ListParts(max-parts)
+ default: 5000
services:
- rgw
- rgw
diff --git a/src/crimson/common/gated.h b/src/crimson/common/gated.h
index 559a889a3e2..f7faef87970 100644
--- a/src/crimson/common/gated.h
+++ b/src/crimson/common/gated.h
@@ -6,6 +6,8 @@
#include <seastar/core/gate.hh>
#include <seastar/core/future.hh>
#include <seastar/core/future-util.hh>
+#include <type_traits>
+#include <vector>
#include "crimson/common/exception.h"
#include "crimson/common/log.h"
@@ -15,15 +17,27 @@ namespace crimson::common {
class Gated {
public:
+ Gated() : sid(seastar::this_shard_id()) {}
+ Gated(const seastar::shard_id sid) : sid(sid) {}
+ Gated(const Gated&) = delete;
+ Gated& operator=(const Gated&) = delete;
+ Gated(Gated&&) = default;
+ Gated& operator=(Gated&&) = delete;
+ virtual ~Gated() = default;
+
static seastar::logger& gated_logger() {
return crimson::get_logger(ceph_subsys_osd);
}
+
template <typename Func, typename T>
inline void dispatch_in_background(const char* what, T& who, Func&& func) {
- (void) dispatch(what, who, func);
+ //ceph_assert(seastar::this_shard_id() == sid);
+ (void) dispatch(what, who, std::forward<Func>(func));
}
+
template <typename Func, typename T>
inline seastar::future<> dispatch(const char* what, T& who, Func&& func) {
+ //ceph_assert(seastar::this_shard_id() == sid);
return seastar::with_gate(pending_dispatch, std::forward<Func>(func)
).handle_exception([what, &who] (std::exception_ptr eptr) {
if (*eptr.__cxa_exception_type() == typeid(system_shutdown_exception)) {
@@ -42,14 +56,81 @@ class Gated {
});
}
+ template <typename Func>
+ auto simple_dispatch(const char* what, Func&& func) {
+ //ceph_assert(seastar::this_shard_id() == sid);
+ return seastar::with_gate(pending_dispatch, std::forward<Func>(func));
+ }
+
seastar::future<> close() {
+ ceph_assert(seastar::this_shard_id() == sid);
return pending_dispatch.close();
}
+
bool is_closed() const {
return pending_dispatch.is_closed();
}
+
+ seastar::shard_id get_shard_id() const {
+ return sid;
+ }
private:
seastar::gate pending_dispatch;
+ const seastar::shard_id sid;
+};
+
+// gate_per_shard is a class that provides a gate for each shard.
+// It was introduced to provide a separate gate for each shard in a
+// seastar application, since gates are not supposed to be shared
+// across shards (https://tracker.ceph.com/issues/64332).
+class gate_per_shard {
+ public:
+ gate_per_shard() : gates(seastar::smp::count) {
+ std::vector<seastar::future<>> futures;
+ for (unsigned shard = 0; shard < seastar::smp::count; ++shard) {
+ futures.push_back(seastar::smp::submit_to(shard, [this, shard] {
+ gates[shard] = std::make_unique<Gated>();
+ }));
+ }
+ seastar::when_all_succeed(futures.begin(), futures.end()).get();
+ }
+ //explicit gate_per_shard(size_t shard_count) : gates(shard_count) {}
+ gate_per_shard(const gate_per_shard&) = delete;
+ gate_per_shard& operator=(const gate_per_shard&) = delete;
+ gate_per_shard(gate_per_shard&&) = default;
+ gate_per_shard& operator=(gate_per_shard&&) = default;
+ ~gate_per_shard() = default;
+
+ template <typename Func, typename T>
+ inline void dispatch_in_background(const char* what, T& who, Func&& func) {
+ (void) dispatch(what, who, std::forward<Func>(func));
+ }
+
+ template <typename Func, typename T>
+ inline auto dispatch(const char* what, T& who, Func&& func) {
+ return gates[seastar::this_shard_id()]->dispatch(what, who, std::forward<Func>(func));
+ }
+
+ template <typename Func>
+ auto simple_dispatch(const char* what, Func&& func) {
+ return gates[seastar::this_shard_id()]->simple_dispatch(what, std::forward<Func>(func));
+ }
+
+ bool is_closed() const {
+ return gates[seastar::this_shard_id()]->is_closed();
+ }
+
+ seastar::future<> close_all() {
+ ceph_assert(gates.size() == seastar::smp::count);
+ return seastar::parallel_for_each(gates.begin(), gates.end(), [] (std::unique_ptr<Gated>& gate_ptr) {
+ return seastar::smp::submit_to(gate_ptr->get_shard_id(), [gate = gate_ptr.get()] {
+ return gate->close();
+ });
+ });
+ }
+
+ private:
+ std::vector<std::unique_ptr<Gated>> gates;
};
-}// namespace crimson::common
+} // namespace crimson::common
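For context, here is a minimal, hypothetical sketch (not part of the patch) of how a multi-shard store can use the new gate_per_shard, mirroring the AlienStore changes further below: work is tracked under the calling shard's gate, and close_all() drains every shard's gate on shutdown.

    // Assumes it runs inside a seastar application: gate_per_shard's
    // constructor submits to every reactor shard to build its gates.
    #include <seastar/core/future.hh>
    #include "crimson/common/gated.h"

    struct ExampleStore {
      crimson::common::gate_per_shard op_gates;  // one Gated instance per shard

      // Track an operation under the gate owned by the current shard.
      seastar::future<> do_op() {
        return op_gates.simple_dispatch("ExampleStore::do_op", [] {
          return seastar::now();  // real work would go here
        });
      }

      // On shutdown, hop to each shard and close its gate, waiting for any
      // work dispatched on that shard to finish first.
      seastar::future<> stop() {
        return op_gates.close_all();
      }
    };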
diff --git a/src/crimson/net/io_handler.h b/src/crimson/net/io_handler.h
index 80dd953c65f..5986fcb16ac 100644
--- a/src/crimson/net/io_handler.h
+++ b/src/crimson/net/io_handler.h
@@ -255,7 +255,7 @@ public:
class shard_states_t {
public:
shard_states_t(seastar::shard_id _sid, io_state_t state)
- : sid{_sid}, io_state{state} {}
+ : sid{_sid}, io_state{state}, gate{_sid} {}
seastar::shard_id get_shard_id() const {
return sid;
diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc
index 21bb250e13f..3fd2bb1fd15 100644
--- a/src/crimson/os/alienstore/alien_store.cc
+++ b/src/crimson/os/alienstore/alien_store.cc
@@ -75,7 +75,8 @@ AlienStore::AlienStore(const std::string& type,
const ConfigValues& values)
: type(type),
path{path},
- values(values)
+ values(values),
+ op_gates()
{
}
@@ -142,12 +143,12 @@ AlienStore::exists(
CollectionRef ch,
const ghobject_t& oid)
{
- return seastar::with_gate(op_gate, [=, this] {
- return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] {
- auto c = static_cast<AlienCollection*>(ch.get());
- return store->exists(c->collection, oid);
+ return op_gates.simple_dispatch("exists", [=, this] {
+ return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] {
+ auto c = static_cast<AlienCollection*>(ch.get());
+ return store->exists(c->collection, oid);
+ });
});
- });
}
AlienStore::mount_ertr::future<> AlienStore::mount()
@@ -173,7 +174,7 @@ seastar::future<> AlienStore::umount()
// not really started yet
return seastar::now();
}
- return op_gate.close().then([this] {
+ return op_gates.close_all().then([this] {
return tp->submit([this] {
{
std::lock_guard l(coll_map_lock);
@@ -183,10 +184,10 @@ seastar::future<> AlienStore::umount()
coll_map.clear();
}
return store->umount();
+ }).then([] (int r) {
+ assert(r == 0);
+ return seastar::now();
});
- }).then([] (int r) {
- assert(r == 0);
- return seastar::now();
});
}
@@ -477,7 +478,7 @@ seastar::future<> AlienStore::inject_data_error(const ghobject_t& o)
{
logger().debug("{}", __func__);
assert(tp);
- return seastar::with_gate(op_gate, [=, this] {
+ return op_gates.simple_dispatch("inject_data_error", [=, this] {
return tp->submit([o, this] {
return store->inject_data_error(o);
});
@@ -488,8 +489,8 @@ seastar::future<> AlienStore::inject_mdata_error(const ghobject_t& o)
{
logger().debug("{}", __func__);
assert(tp);
- return seastar::with_gate(op_gate, [=, this] {
- return tp->submit([=, this] {
+ return op_gates.simple_dispatch("inject_mdata_error", [=, this] {
+ return tp->submit([o, this] {
return store->inject_mdata_error(o);
});
});
@@ -500,7 +501,7 @@ seastar::future<> AlienStore::write_meta(const std::string& key,
{
logger().debug("{}", __func__);
assert(tp);
- return seastar::with_gate(op_gate, [=, this] {
+ return op_gates.simple_dispatch("write_meta", [=, this] {
return tp->submit([=, this] {
return store->write_meta(key, value);
}).then([] (int r) {
@@ -515,8 +516,8 @@ AlienStore::read_meta(const std::string& key)
{
logger().debug("{}", __func__);
assert(tp);
- return seastar::with_gate(op_gate, [this, key] {
- return tp->submit([this, key] {
+ return op_gates.simple_dispatch("read_meta", [this, key] {
+ return tp->submit([key, this] {
std::string value;
int r = store->read_meta(key, &value);
if (r > 0) {
diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h
index 734ee160959..d36f449afd8 100644
--- a/src/crimson/os/alienstore/alien_store.h
+++ b/src/crimson/os/alienstore/alien_store.h
@@ -10,6 +10,7 @@
#include "os/ObjectStore.h"
#include "osd/osd_types.h"
+#include "crimson/common/gated.h"
#include "crimson/os/alienstore/thread_pool.h"
#include "crimson/os/futurized_collection.h"
#include "crimson/os/futurized_store.h"
@@ -111,9 +112,10 @@ public:
}
private:
+
template <class... Args>
auto do_with_op_gate(Args&&... args) const {
- return seastar::with_gate(op_gate,
+ return op_gates.simple_dispatch("AlienStore::do_with_op_gate",
// perfect forwarding in lambda's closure isn't available in C++17
// using tuple as workaround; see: https://stackoverflow.com/a/49902823
[args = std::make_tuple(std::forward<Args>(args)...)] () mutable {
@@ -130,7 +132,7 @@ private:
uint64_t used_bytes = 0;
std::unique_ptr<ObjectStore> store;
std::unique_ptr<CephContext> cct;
- mutable seastar::gate op_gate;
+ mutable crimson::common::gate_per_shard op_gates;
/**
* coll_map
diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc
index 30ff4554074..f89698d602a 100644
--- a/src/crimson/os/seastore/backref/btree_backref_manager.cc
+++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc
@@ -38,12 +38,14 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node<
trans_intr::make_interruptible(
c.cache.get_extent_viewable_by_trans(c.trans, backref_root))};
} else {
+ c.cache.account_absent_access(c.trans.get_src());
return {false,
trans_intr::make_interruptible(
Cache::get_extent_ertr::make_ready_future<
CachedExtentRef>())};
}
} else {
+ c.cache.account_absent_access(c.trans.get_src());
return {false,
trans_intr::make_interruptible(
Cache::get_extent_ertr::make_ready_future<
diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h
index eacf8a8cc40..09f54a4f2d0 100644
--- a/src/crimson/os/seastore/btree/fixed_kv_node.h
+++ b/src/crimson/os/seastore/btree/fixed_kv_node.h
@@ -370,9 +370,11 @@ struct FixedKVNode : ChildableCachedExtent {
if (is_valid_child_ptr(child)) {
return c.cache.template get_extent_viewable_by_trans<T>(c.trans, (T*)child);
} else {
+ c.cache.account_absent_access(c.trans.get_src());
return child_pos_t(&sparent, spos);
}
} else {
+ c.cache.account_absent_access(c.trans.get_src());
return child_pos_t(this, pos);
}
}
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
index 56f7e3c76a9..cf8d3c0891d 100644
--- a/src/crimson/os/seastore/cache.cc
+++ b/src/crimson/os/seastore/cache.cc
@@ -52,10 +52,10 @@ Cache::Cache(
Cache::~Cache()
{
LOG_PREFIX(Cache::~Cache);
- for (auto &i: extents) {
+ for (auto &i: extents_index) {
ERROR("extent is still alive -- {}", i);
}
- ceph_assert(extents.empty());
+ ceph_assert(extents_index.empty());
}
// TODO: this method can probably be removed in the future
@@ -83,7 +83,7 @@ Cache::retire_extent_ret Cache::retire_extent_addr(
// absent from transaction
// retiring is not included by the cache hit metrics
- ext = query_cache(addr, nullptr);
+ ext = query_cache(addr);
if (ext) {
DEBUGT("retire {}~{} in cache -- {}", t, addr, length, *ext);
} else {
@@ -112,7 +112,7 @@ void Cache::retire_absent_extent_addr(
auto result = t.get_extent(addr, &ext);
assert(result != Transaction::get_extent_ret::PRESENT
&& result != Transaction::get_extent_ret::RETIRED);
- assert(!query_cache(addr, nullptr));
+ assert(!query_cache(addr));
#endif
LOG_PREFIX(Cache::retire_absent_extent_addr);
// add a new placeholder to Cache
@@ -134,7 +134,7 @@ void Cache::dump_contents()
{
LOG_PREFIX(Cache::dump_contents);
DEBUG("enter");
- for (auto &&i: extents) {
+ for (auto &&i: extents_index) {
DEBUG("live {}", i);
}
DEBUG("exit");
@@ -150,6 +150,9 @@ void Cache::register_metrics()
last_dirty_io_by_src_ext = {};
last_trim_rewrites = {};
last_reclaim_rewrites = {};
+ last_access = {};
+ last_cache_absent_by_src = {};
+ last_access_by_src_ext = {};
namespace sm = seastar::metrics;
using src_t = Transaction::src_t;
@@ -204,25 +207,25 @@ void Cache::register_metrics()
/*
* cache_query: cache_access and cache_hit
*/
- for (auto& [src, src_label] : labels_by_src) {
- metrics.add_group(
- "cache",
- {
- sm::make_counter(
- "cache_access",
- get_by_src(stats.cache_query_by_src, src).access,
- sm::description("total number of cache accesses"),
- {src_label}
- ),
- sm::make_counter(
- "cache_hit",
- get_by_src(stats.cache_query_by_src, src).hit,
- sm::description("total number of cache hits"),
- {src_label}
- ),
- }
- );
- }
+ metrics.add_group(
+ "cache",
+ {
+ sm::make_counter(
+ "cache_access",
+ [this] {
+ return stats.access.get_cache_access();
+ },
+ sm::description("total number of cache accesses")
+ ),
+ sm::make_counter(
+ "cache_hit",
+ [this] {
+ return stats.access.s.get_cache_hit();
+ },
+ sm::description("total number of cache hits")
+ ),
+ }
+ );
{
/*
@@ -487,14 +490,14 @@ void Cache::register_metrics()
sm::make_counter(
"cached_extents",
[this] {
- return extents.size();
+ return extents_index.size();
},
sm::description("total number of cached extents")
),
sm::make_counter(
"cached_extent_bytes",
[this] {
- return extents.get_bytes();
+ return extents_index.get_bytes();
},
sm::description("total bytes of cached extents")
),
@@ -738,7 +741,7 @@ void Cache::add_extent(CachedExtentRef ref)
assert(ref->is_valid());
assert(ref->user_hint == PLACEMENT_HINT_NULL);
assert(ref->rewrite_generation == NULL_GENERATION);
- extents.insert(*ref);
+ extents_index.insert(*ref);
}
void Cache::mark_dirty(CachedExtentRef ref)
@@ -876,7 +879,7 @@ void Cache::remove_extent(
} else if (!ref->is_placeholder()) {
lru.remove_from_lru(*ref);
}
- extents.erase(*ref);
+ extents_index.erase(*ref);
}
void Cache::commit_retire_extent(
@@ -897,7 +900,7 @@ void Cache::commit_replace_extent(
{
assert(next->get_paddr() == prev->get_paddr());
assert(next->version == prev->version + 1);
- extents.replace(*next, *prev);
+ extents_index.replace(*next, *prev);
const auto t_src = t.get_src();
if (is_root_type(prev->get_type())) {
@@ -1822,7 +1825,7 @@ void Cache::init()
NULL_GENERATION,
TRANS_ID_NULL);
INFO("init root -- {}", *root);
- extents.insert(*root);
+ extents_index.insert(*root);
}
Cache::mkfs_iertr::future<> Cache::mkfs(Transaction &t)
@@ -1851,8 +1854,8 @@ Cache::close_ertr::future<> Cache::close()
get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL),
lru.get_current_num_extents(),
lru.get_current_size_bytes(),
- extents.size(),
- extents.get_bytes());
+ extents_index.size(),
+ extents_index.get_bytes());
root.reset();
clear_dirty();
backref_extents.clear();
@@ -1970,7 +1973,7 @@ Cache::replay_delta(
auto _get_extent_if_cached = [this](paddr_t addr)
-> get_extent_ertr::future<CachedExtentRef> {
// replay is not included by the cache hit metrics
- auto ret = query_cache(addr, nullptr);
+ auto ret = query_cache(addr);
if (ret) {
// no retired-placeholder should be exist yet because no transaction
// has been created.
@@ -1983,15 +1986,14 @@ Cache::replay_delta(
}
};
auto extent_fut = (delta.pversion == 0 ?
- // replay is not included by the cache hit metrics
do_get_caching_extent_by_type(
delta.type,
delta.paddr,
delta.laddr,
delta.length,
- nullptr,
[](CachedExtent &) {},
[this](CachedExtent &ext) {
+ // replay is not included by the cache hit metrics
touch_extent(ext, nullptr);
}) :
_get_extent_if_cached(
@@ -2155,73 +2157,65 @@ Cache::do_get_caching_extent_by_type(
paddr_t offset,
laddr_t laddr,
extent_len_t length,
- const Transaction::src_t* p_src,
extent_init_func_t &&extent_init_func,
extent_init_func_t &&on_cache)
{
return [=, this, extent_init_func=std::move(extent_init_func)]() mutable {
- src_ext_t* p_metric_key = nullptr;
- src_ext_t metric_key;
- if (p_src) {
- metric_key = std::make_pair(*p_src, type);
- p_metric_key = &metric_key;
- }
-
switch (type) {
case extent_types_t::ROOT:
ceph_assert(0 == "ROOT is never directly read");
return get_extent_ertr::make_ready_future<CachedExtentRef>();
case extent_types_t::BACKREF_INTERNAL:
return do_get_caching_extent<backref::BackrefInternalNode>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::BACKREF_LEAF:
return do_get_caching_extent<backref::BackrefLeafNode>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::LADDR_INTERNAL:
return do_get_caching_extent<lba_manager::btree::LBAInternalNode>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::LADDR_LEAF:
return do_get_caching_extent<lba_manager::btree::LBALeafNode>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::OMAP_INNER:
return do_get_caching_extent<omap_manager::OMapInnerNode>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::OMAP_LEAF:
return do_get_caching_extent<omap_manager::OMapLeafNode>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::COLL_BLOCK:
return do_get_caching_extent<collection_manager::CollectionNode>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::ONODE_BLOCK_STAGED:
return do_get_caching_extent<onode::SeastoreNodeExtent>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::OBJECT_DATA_BLOCK:
return do_get_caching_extent<ObjectDataBlock>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
@@ -2230,13 +2224,13 @@ Cache::do_get_caching_extent_by_type(
return get_extent_ertr::make_ready_future<CachedExtentRef>();
case extent_types_t::TEST_BLOCK:
return do_get_caching_extent<TestBlock>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::TEST_BLOCK_PHYSICAL:
return do_get_caching_extent<TestBlockPhysical>(
- offset, length, p_metric_key, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache)
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
@@ -2266,12 +2260,17 @@ cache_stats_t Cache::get_stats(
lru.get_stats(ret, report_detail, seconds);
/*
- * get dirty stats
+ * dirty stats
+ * rewrite stats
+ * index stats
+ * access stats
*/
ret.dirty_sizes = cache_size_stats_t{stats.dirty_bytes, dirty.size()};
ret.dirty_io = stats.dirty_io;
ret.dirty_io.minus(last_dirty_io);
+ ret.access = stats.access;
+ ret.access.minus(last_access);
if (report_detail && seconds != 0) {
counter_by_src_t<counter_by_extent_t<dirty_io_stats_t> >
@@ -2326,7 +2325,7 @@ cache_stats_t Cache::get_stats(
const auto& io_by_ext = get_by_src(_trans_io_by_src_ext, src);
for (uint8_t _ext=0; _ext<EXTENT_TYPES_MAX; ++_ext) {
auto ext = static_cast<extent_types_t>(_ext);
- const auto extent_io = get_by_ext(io_by_ext, ext);
+ const auto& extent_io = get_by_ext(io_by_ext, ext);
if (is_data_type(ext)) {
data_io.add(extent_io);
} else if (is_logical_metadata_type(ext)) {
@@ -2363,14 +2362,71 @@ cache_stats_t Cache::get_stats(
<< "ps, dversion="
<< fmt::format(dfmt, _reclaim_rewrites.get_avg_version());
+ oss << "\ncache total"
+ << cache_size_stats_t{extents_index.get_bytes(), extents_index.size()};
+
+ counter_by_src_t<counter_by_extent_t<extent_access_stats_t> >
+ _access_by_src_ext = stats.access_by_src_ext;
+ counter_by_src_t<cache_access_stats_t> access_by_src;
+ for (uint8_t _src=0; _src<TRANSACTION_TYPE_MAX; ++_src) {
+ auto src = static_cast<transaction_type_t>(_src);
+ cache_access_stats_t& trans_access = get_by_src(access_by_src, src);
+ trans_access.cache_absent = get_by_src(stats.cache_absent_by_src, src);
+ trans_access.cache_absent -= get_by_src(last_cache_absent_by_src, src);
+ auto& access_by_ext = get_by_src(_access_by_src_ext, src);
+ const auto& last_access_by_ext = get_by_src(last_access_by_src_ext, src);
+ for (uint8_t _ext=0; _ext<EXTENT_TYPES_MAX; ++_ext) {
+ auto ext = static_cast<extent_types_t>(_ext);
+ extent_access_stats_t& extent_access = get_by_ext(access_by_ext, ext);
+ const auto& last_extent_access = get_by_ext(last_access_by_ext, ext);
+ extent_access.minus(last_extent_access);
+ trans_access.s.add(extent_access);
+ }
+ }
+ oss << "\naccess: total"
+ << cache_access_stats_printer_t{seconds, ret.access};
+ for (uint8_t _src=0; _src<TRANSACTION_TYPE_MAX; ++_src) {
+ auto src = static_cast<transaction_type_t>(_src);
+ const auto& trans_access = get_by_src(access_by_src, src);
+ if (trans_access.is_empty()) {
+ continue;
+ }
+ extent_access_stats_t data_access;
+ extent_access_stats_t mdat_access;
+ extent_access_stats_t phys_access;
+ const auto& access_by_ext = get_by_src(_access_by_src_ext, src);
+ for (uint8_t _ext=0; _ext<EXTENT_TYPES_MAX; ++_ext) {
+ auto ext = static_cast<extent_types_t>(_ext);
+ const auto& extent_access = get_by_ext(access_by_ext, ext);
+ if (is_data_type(ext)) {
+ data_access.add(extent_access);
+ } else if (is_logical_metadata_type(ext)) {
+ mdat_access.add(extent_access);
+ } else if (is_physical_type(ext)) {
+ phys_access.add(extent_access);
+ }
+ }
+ oss << "\n " << src << ": "
+ << cache_access_stats_printer_t{seconds, trans_access}
+ << "\n data"
+ << extent_access_stats_printer_t{seconds, data_access}
+ << "\n mdat"
+ << extent_access_stats_printer_t{seconds, mdat_access}
+ << "\n phys"
+ << extent_access_stats_printer_t{seconds, phys_access};
+ }
+
INFO("{}", oss.str());
last_dirty_io_by_src_ext = stats.dirty_io_by_src_ext;
last_trim_rewrites = stats.trim_rewrites;
last_reclaim_rewrites = stats.reclaim_rewrites;
+ last_cache_absent_by_src = stats.cache_absent_by_src;
+ last_access_by_src_ext = stats.access_by_src_ext;
}
last_dirty_io = stats.dirty_io;
+ last_access = stats.access;
return ret;
}
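The new access counters are reported with the same snapshot-and-subtract pattern get_stats() already uses for dirty I/O: copy the running totals, subtract the snapshot taken at the previous report to get the per-interval delta, then roll the snapshot forward. A small self-contained illustration of that pattern (with a made-up counter struct, not the real cache_access_stats_t):

    #include <cstdint>
    #include <iostream>

    struct access_counters_t {
      uint64_t hit = 0;
      uint64_t absent = 0;
      void minus(const access_counters_t &o) {
        hit -= o.hit;
        absent -= o.absent;
      }
    };

    int main() {
      access_counters_t totals;   // running totals, live for the cache's lifetime
      access_counters_t last;     // snapshot taken at the previous report

      totals.hit = 120;           // pretend some accesses happened
      totals.absent = 30;

      access_counters_t interval = totals;  // copy the current totals
      interval.minus(last);                 // keep only this interval's delta
      std::cout << "hit=" << interval.hit
                << " absent=" << interval.absent << "\n";

      last = totals;              // roll the snapshot forward for the next report
      return 0;
    }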
diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h
index 7bfc4d07aab..dba3610e95f 100644
--- a/src/crimson/os/seastore/cache.h
+++ b/src/crimson/os/seastore/cache.h
@@ -167,7 +167,7 @@ using backref_entry_query_set_t = std::set<
* - Remove all extents in the retired_set from Cache::extents
* - Mark all extents in the write_set wait_io(), add promises to
* transaction
- * - Merge Transaction::write_set into Cache::extents
+ * - Merge Transaction::write_set into Cache::extents_index
*
* After phase 2, the user will submit the record to the journal.
* Once complete, we perform phase 3:
@@ -272,6 +272,11 @@ public:
return t.root;
}
+ void account_absent_access(Transaction::src_t src) {
+ ++(get_by_src(stats.cache_absent_by_src, src));
+ ++stats.access.cache_absent;
+ }
+
/**
* get_extent_if_cached
*
@@ -287,12 +292,29 @@ public:
CachedExtentRef ret;
LOG_PREFIX(Cache::get_extent_if_cached);
auto result = t.get_extent(offset, &ret);
+ const auto t_src = t.get_src();
+ extent_access_stats_t& access_stats = get_by_ext(
+ get_by_src(stats.access_by_src_ext, t_src),
+ type);
if (result == Transaction::get_extent_ret::RETIRED) {
SUBDEBUGT(seastore_cache, "{} {} is retired on t -- {}",
t, type, offset, *ret);
return get_extent_if_cached_iertr::make_ready_future<
CachedExtentRef>(ret);
} else if (result == Transaction::get_extent_ret::PRESENT) {
+ if (ret->is_stable()) {
+ if (ret->is_dirty()) {
+ ++access_stats.trans_dirty;
+ ++stats.access.s.trans_dirty;
+ } else {
+ ++access_stats.trans_lru;
+ ++stats.access.s.trans_lru;
+ }
+ } else {
+ ++access_stats.trans_pending;
+ ++stats.access.s.trans_pending;
+ }
+
if (ret->is_fully_loaded()) {
SUBTRACET(seastore_cache, "{} {} is present on t -- {}",
t, type, offset, *ret);
@@ -309,17 +331,29 @@ public:
}
// get_extent_ret::ABSENT from transaction
- auto metric_key = std::make_pair(t.get_src(), type);
- ret = query_cache(offset, &metric_key);
+ ret = query_cache(offset);
if (!ret) {
SUBDEBUGT(seastore_cache, "{} {} is absent", t, type, offset);
+ account_absent_access(t_src);
return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>();
} else if (is_retired_placeholder_type(ret->get_type())) {
// retired_placeholder is not really cached yet
SUBDEBUGT(seastore_cache, "{} {} is absent(placeholder)",
t, type, offset);
+ account_absent_access(t_src);
return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>();
- } else if (!ret->is_fully_loaded()) {
+ }
+
+ if (ret->is_dirty()) {
+ ++access_stats.cache_dirty;
+ ++stats.access.s.cache_dirty;
+ } else {
+ ++access_stats.cache_lru;
+ ++stats.access.s.cache_lru;
+ }
+
+ if (!ret->is_fully_loaded()) {
+ // ignore non-full extent
SUBDEBUGT(seastore_cache, "{} {} is present without "
"being fully loaded", t, type, offset);
return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>();
@@ -329,7 +363,6 @@ public:
SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}",
t, type, offset, *ret);
t.add_to_read_set(ret);
- const auto t_src = t.get_src();
touch_extent(*ret, &t_src);
return ret->wait_io().then([ret] {
return get_extent_if_cached_iertr::make_ready_future<
@@ -349,6 +382,8 @@ public:
*
* Note, the current implementation leverages parent-child
* pointers in LBA instead, so it should only be called in tests.
+ *
+   * This path is not accounted for in cache_access_stats_t.
*/
using get_extent_iertr = base_iertr;
template <typename T>
@@ -389,11 +424,9 @@ public:
const auto t_src = t.get_src();
touch_extent(ext, &t_src);
};
- auto metric_key = std::make_pair(t.get_src(), T::TYPE);
return trans_intr::make_interruptible(
do_get_caching_extent<T>(
- offset, length, &metric_key,
- [](T &){}, std::move(f))
+ offset, length, [](T &){}, std::move(f))
);
}
}
@@ -423,15 +456,22 @@ public:
SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...",
t, T::TYPE, offset, length);
auto f = [&t, this](CachedExtent &ext) {
- t.add_to_read_set(CachedExtentRef(&ext));
+ // FIXME: assert(ext.is_stable_clean());
+ assert(ext.is_stable());
+ assert(T::TYPE == ext.get_type());
const auto t_src = t.get_src();
+ extent_access_stats_t& access_stats = get_by_ext(
+ get_by_src(stats.access_by_src_ext, t_src),
+ T::TYPE);
+ ++access_stats.load_absent;
+ ++stats.access.s.load_absent;
+
+ t.add_to_read_set(CachedExtentRef(&ext));
touch_extent(ext, &t_src);
};
- auto metric_key = std::make_pair(t.get_src(), T::TYPE);
return trans_intr::make_interruptible(
do_get_caching_extent<T>(
- offset, length, &metric_key,
- std::forward<Func>(extent_init_func), std::move(f))
+ offset, length, std::forward<Func>(extent_init_func), std::move(f))
);
}
@@ -480,6 +520,13 @@ public:
CachedExtentRef extent)
{
assert(extent->is_valid());
+
+ const auto t_src = t.get_src();
+ auto ext_type = extent->get_type();
+ extent_access_stats_t& access_stats = get_by_ext(
+ get_by_src(stats.access_by_src_ext, t_src),
+ ext_type);
+
CachedExtent* p_extent;
if (extent->is_stable()) {
p_extent = extent->get_transactional_view(t);
@@ -487,6 +534,8 @@ public:
assert(!extent->is_stable_writting());
assert(p_extent->is_pending_in_trans(t.get_trans_id()));
assert(!p_extent->is_stable_writting());
+ ++access_stats.trans_pending;
+ ++stats.access.s.trans_pending;
if (p_extent->is_mutable()) {
assert(p_extent->is_fully_loaded());
assert(!p_extent->is_pending_io());
@@ -499,13 +548,29 @@ public:
// stable from trans-view
assert(!p_extent->is_pending_in_trans(t.get_trans_id()));
if (t.maybe_add_to_read_set(p_extent)) {
- const auto t_src = t.get_src();
+ if (p_extent->is_dirty()) {
+ ++access_stats.cache_dirty;
+ ++stats.access.s.cache_dirty;
+ } else {
+ ++access_stats.cache_lru;
+ ++stats.access.s.cache_lru;
+ }
touch_extent(*p_extent, &t_src);
+ } else {
+ if (p_extent->is_dirty()) {
+ ++access_stats.trans_dirty;
+ ++stats.access.s.trans_dirty;
+ } else {
+ ++access_stats.trans_lru;
+ ++stats.access.s.trans_lru;
+ }
}
}
} else {
assert(!extent->is_stable_writting());
assert(extent->is_pending_in_trans(t.get_trans_id()));
+ ++access_stats.trans_pending;
+ ++stats.access.s.trans_pending;
if (extent->is_mutable()) {
assert(extent->is_fully_loaded());
assert(!extent->is_pending_io());
@@ -520,6 +585,8 @@ public:
ceph_assert(!is_retired_placeholder_type(p_extent->get_type()));
if (!p_extent->is_fully_loaded()) {
assert(!p_extent->is_mutable());
+ ++access_stats.load_present;
+ ++stats.access.s.load_present;
LOG_PREFIX(Cache::get_extent_viewable_by_trans);
SUBDEBUG(seastore_cache,
"{} {}~{} is present without been fully loaded, reading ... -- {}",
@@ -557,7 +624,7 @@ public:
// Interfaces only for tests.
public:
CachedExtentRef test_query_cache(paddr_t offset) {
- return query_cache(offset, nullptr);
+ return query_cache(offset);
}
private:
@@ -573,12 +640,11 @@ private:
read_extent_ret<T> do_get_caching_extent(
paddr_t offset, ///< [in] starting addr
extent_len_t length, ///< [in] length
- const src_ext_t* p_src_ext, ///< [in] cache query metric key
Func &&extent_init_func, ///< [in] init func for extent
OnCache &&on_cache
) {
LOG_PREFIX(Cache::do_get_caching_extent);
- auto cached = query_cache(offset, p_src_ext);
+ auto cached = query_cache(offset);
if (!cached) {
auto ret = CachedExtent::make_cached_extent_ref<T>(
alloc_cache_buf(length));
@@ -610,7 +676,7 @@ private:
SUBDEBUG(seastore_cache,
"{} {}~{} is absent(placeholder), reading ... -- {}",
T::TYPE, offset, length, *ret);
- extents.replace(*ret, *cached);
+ extents_index.replace(*ret, *cached);
on_cache(*ret);
// replace placeholder in transactions
@@ -684,11 +750,18 @@ private:
paddr_t offset,
laddr_t laddr,
extent_len_t length,
- const Transaction::src_t* p_src,
extent_init_func_t &&extent_init_func,
extent_init_func_t &&on_cache
);
+ /**
+ * get_caching_extent_by_type
+ *
+ * Note, the current implementation leverages parent-child
+ * pointers in LBA instead, so it should only be called in tests.
+ *
+   * This path won't be accounted for in cache_access_stats_t.
+ */
using get_extent_by_type_iertr = get_extent_iertr;
using get_extent_by_type_ret = get_extent_by_type_iertr::future<
CachedExtentRef>;
@@ -731,10 +804,9 @@ private:
const auto t_src = t.get_src();
touch_extent(ext, &t_src);
};
- auto src = t.get_src();
return trans_intr::make_interruptible(
do_get_caching_extent_by_type(
- type, offset, laddr, length, &src,
+ type, offset, laddr, length,
std::move(extent_init_func), std::move(f))
);
}
@@ -762,14 +834,21 @@ private:
SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...",
t, type, offset, length, laddr);
auto f = [&t, this](CachedExtent &ext) {
- t.add_to_read_set(CachedExtentRef(&ext));
+ // FIXME: assert(ext.is_stable_clean());
+ assert(ext.is_stable());
const auto t_src = t.get_src();
+ extent_access_stats_t& access_stats = get_by_ext(
+ get_by_src(stats.access_by_src_ext, t_src),
+ ext.get_type());
+ ++access_stats.load_absent;
+ ++stats.access.s.load_absent;
+
+ t.add_to_read_set(CachedExtentRef(&ext));
touch_extent(ext, &t_src);
};
- auto src = t.get_src();
return trans_intr::make_interruptible(
do_get_caching_extent_by_type(
- type, offset, laddr, length, &src,
+ type, offset, laddr, length,
std::move(extent_init_func), std::move(f))
);
}
@@ -1141,8 +1220,8 @@ public:
SUBINFOT(seastore_cache,
"start with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
t,
- extents.size(),
- extents.get_bytes(),
+ extents_index.size(),
+ extents_index.get_bytes(),
dirty.size(),
get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL));
@@ -1151,7 +1230,7 @@ public:
// Cache::root should have been inserted to the dirty list
assert(root->is_dirty());
std::vector<CachedExtentRef> _dirty;
- for (auto &e : extents) {
+ for (auto &e : extents_index) {
_dirty.push_back(CachedExtentRef(&e));
}
return seastar::do_with(
@@ -1184,8 +1263,8 @@ public:
SUBINFOT(seastore_cache,
"finish with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
t,
- extents.size(),
- extents.get_bytes(),
+ extents_index.size(),
+ extents_index.get_bytes(),
dirty.size(),
get_oldest_dirty_from().value_or(JOURNAL_SEQ_NULL),
get_oldest_backref_dirty_from().value_or(JOURNAL_SEQ_NULL));
@@ -1370,7 +1449,7 @@ private:
ExtentPlacementManager& epm;
RootBlockRef root; ///< ref to current root
- ExtentIndex extents; ///< set of live extents
+ ExtentIndex extents_index; ///< set of live extents
journal_seq_t last_commit = JOURNAL_SEQ_MIN;
@@ -1598,7 +1677,6 @@ private:
counter_by_src_t<uint64_t> trans_created_by_src;
counter_by_src_t<commit_trans_efforts_t> committed_efforts_by_src;
counter_by_src_t<invalid_trans_efforts_t> invalidated_efforts_by_src;
- counter_by_src_t<query_counters_t> cache_query_by_src;
success_read_trans_efforts_t success_read_efforts;
uint64_t dirty_bytes = 0;
@@ -1607,6 +1685,11 @@ private:
counter_by_src_t<counter_by_extent_t<dirty_io_stats_t> >
dirty_io_by_src_ext;
+ cache_access_stats_t access;
+ counter_by_src_t<uint64_t> cache_absent_by_src;
+ counter_by_src_t<counter_by_extent_t<extent_access_stats_t> >
+ access_by_src_ext;
+
uint64_t onode_tree_depth = 0;
int64_t onode_tree_extents_num = 0;
counter_by_src_t<tree_efforts_t> committed_onode_tree_efforts;
@@ -1639,6 +1722,10 @@ private:
last_dirty_io_by_src_ext;
mutable rewrite_stats_t last_trim_rewrites;
mutable rewrite_stats_t last_reclaim_rewrites;
+ mutable cache_access_stats_t last_access;
+ mutable counter_by_src_t<uint64_t> last_cache_absent_by_src;
+ mutable counter_by_src_t<counter_by_extent_t<extent_access_stats_t> >
+ last_access_by_src_ext;
void account_conflict(Transaction::src_t src1, Transaction::src_t src2) {
assert(src1 < Transaction::src_t::MAX);
@@ -1776,21 +1863,10 @@ private:
}
// Extents in cache may contain placeholders
- CachedExtentRef query_cache(
- paddr_t offset,
- const src_ext_t* p_metric_key) {
- query_counters_t* p_counters = nullptr;
- if (p_metric_key) {
- p_counters = &get_by_src(stats.cache_query_by_src, p_metric_key->first);
- ++p_counters->access;
- }
- if (auto iter = extents.find_offset(offset);
- iter != extents.end()) {
- if (p_metric_key &&
- // retired_placeholder is not really cached yet
- !is_retired_placeholder_type(iter->get_type())) {
- ++p_counters->hit;
- }
+ CachedExtentRef query_cache(paddr_t offset) {
+ if (auto iter = extents_index.find_offset(offset);
+ iter != extents_index.end()) {
+ assert(iter->is_stable());
return CachedExtentRef(&*iter);
} else {
return CachedExtentRef();
diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h
index 96cf91b1d43..6c5c6c6fcc2 100644
--- a/src/crimson/os/seastore/cached_extent.h
+++ b/src/crimson/os/seastore/cached_extent.h
@@ -786,7 +786,7 @@ protected:
struct retired_placeholder_t{};
CachedExtent(retired_placeholder_t, extent_len_t _length)
- : state(extent_state_t::INVALID),
+ : state(extent_state_t::CLEAN),
length(_length) {
assert(length > 0);
}
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
index 8439b733704..b7a1d8f8ba9 100644
--- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
@@ -62,12 +62,14 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node<
trans_intr::make_interruptible(
c.cache.get_extent_viewable_by_trans(c.trans, lba_root))};
} else {
+ c.cache.account_absent_access(c.trans.get_src());
return {false,
trans_intr::make_interruptible(
Cache::get_extent_ertr::make_ready_future<
CachedExtentRef>())};
}
} else {
+ c.cache.account_absent_access(c.trans.get_src());
return {false,
trans_intr::make_interruptible(
Cache::get_extent_ertr::make_ready_future<
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
index 9a0ac439083..15774332373 100644
--- a/src/crimson/os/seastore/seastore.cc
+++ b/src/crimson/os/seastore/seastore.cc
@@ -711,13 +711,9 @@ seastar::future<> SeaStore::report_stats()
}
cache_size_stats_t lru_sizes_ps = cache_total.lru_sizes;
- lru_sizes_ps.size /= seastar::smp::count;
- lru_sizes_ps.num_extents /= seastar::smp::count;
+ lru_sizes_ps.divide_by(seastar::smp::count);
cache_io_stats_t lru_io_ps = cache_total.lru_io;
- lru_io_ps.in_sizes.size /= seastar::smp::count;
- lru_io_ps.in_sizes.num_extents /= seastar::smp::count;
- lru_io_ps.out_sizes.size /= seastar::smp::count;
- lru_io_ps.out_sizes.num_extents /= seastar::smp::count;
+ lru_io_ps.divide_by(seastar::smp::count);
INFO("cache lru: total{} {}; per-shard: total{} {}",
cache_total.lru_sizes,
cache_io_stats_printer_t{seconds, cache_total.lru_io},
@@ -725,21 +721,21 @@ seastar::future<> SeaStore::report_stats()
cache_io_stats_printer_t{seconds, lru_io_ps});
cache_size_stats_t dirty_sizes_ps = cache_total.dirty_sizes;
- dirty_sizes_ps.size /= seastar::smp::count;
- dirty_sizes_ps.num_extents /= seastar::smp::count;
+ dirty_sizes_ps.divide_by(seastar::smp::count);
dirty_io_stats_t dirty_io_ps = cache_total.dirty_io;
- dirty_io_ps.in_sizes.size /= seastar::smp::count;
- dirty_io_ps.in_sizes.num_extents /= seastar::smp::count;
- dirty_io_ps.num_replace /= seastar::smp::count;
- dirty_io_ps.out_sizes.size /= seastar::smp::count;
- dirty_io_ps.out_sizes.num_extents /= seastar::smp::count;
- dirty_io_ps.out_versions /= seastar::smp::count;
+ dirty_io_ps.divide_by(seastar::smp::count);
INFO("cache dirty: total{} {}; per-shard: total{} {}",
cache_total.dirty_sizes,
dirty_io_stats_printer_t{seconds, cache_total.dirty_io},
dirty_sizes_ps,
dirty_io_stats_printer_t{seconds, dirty_io_ps});
+ cache_access_stats_t access_ps = cache_total.access;
+ access_ps.divide_by(seastar::smp::count);
+ INFO("cache_access: total{}; per-shard{}",
+ cache_access_stats_printer_t{seconds, cache_total.access},
+ cache_access_stats_printer_t{seconds, access_ps});
+
return seastar::now();
});
}
diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc
index 13637a59db6..e1430b30019 100644
--- a/src/crimson/os/seastore/seastore_types.cc
+++ b/src/crimson/os/seastore/seastore_types.cc
@@ -1038,4 +1038,78 @@ std::ostream& operator<<(std::ostream& out, const dirty_io_stats_printer_t& p)
return out;
}
+std::ostream& operator<<(std::ostream& out, const extent_access_stats_printer_t& p)
+{
+ constexpr const char* dfmt = "{:.2f}";
+ double est_total_access = static_cast<double>(p.stats.get_estimated_total_access());
+ out << "(~";
+ if (est_total_access > 1000000) {
+ out << fmt::format(dfmt, est_total_access/1000000)
+ << "M, ";
+ } else {
+ out << fmt::format(dfmt, est_total_access/1000)
+ << "K, ";
+ }
+ double trans_hit = static_cast<double>(p.stats.get_trans_hit());
+ double cache_hit = static_cast<double>(p.stats.get_cache_hit());
+ double est_cache_access = static_cast<double>(p.stats.get_estimated_cache_access());
+ double load_absent = static_cast<double>(p.stats.load_absent);
+ out << "trans-hit=~"
+ << fmt::format(dfmt, trans_hit/est_total_access*100)
+ << "%(p"
+ << fmt::format(dfmt, p.stats.trans_pending/trans_hit)
+ << ",d"
+ << fmt::format(dfmt, p.stats.trans_dirty/trans_hit)
+ << ",l"
+ << fmt::format(dfmt, p.stats.trans_lru/trans_hit)
+ << "), cache-hit=~"
+ << fmt::format(dfmt, cache_hit/est_cache_access*100)
+ << "%(d"
+ << fmt::format(dfmt, p.stats.cache_dirty/cache_hit)
+ << ",l"
+ << fmt::format(dfmt, p.stats.cache_lru/cache_hit)
+ <<"), load-present/absent="
+ << fmt::format(dfmt, p.stats.load_present/load_absent)
+ << ")";
+ return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const cache_access_stats_printer_t& p)
+{
+ constexpr const char* dfmt = "{:.2f}";
+ double total_access = static_cast<double>(p.stats.get_total_access());
+ out << "(";
+ if (total_access > 1000000) {
+ out << fmt::format(dfmt, total_access/1000000)
+ << "M, ";
+ } else {
+ out << fmt::format(dfmt, total_access/1000)
+ << "K, ";
+ }
+ double trans_hit = static_cast<double>(p.stats.s.get_trans_hit());
+ double cache_hit = static_cast<double>(p.stats.s.get_cache_hit());
+ double cache_access = static_cast<double>(p.stats.get_cache_access());
+ double load_absent = static_cast<double>(p.stats.s.load_absent);
+ out << "trans-hit="
+ << fmt::format(dfmt, trans_hit/total_access*100)
+ << "%(p"
+ << fmt::format(dfmt, p.stats.s.trans_pending/trans_hit)
+ << ",d"
+ << fmt::format(dfmt, p.stats.s.trans_dirty/trans_hit)
+ << ",l"
+ << fmt::format(dfmt, p.stats.s.trans_lru/trans_hit)
+ << "), cache-hit="
+ << fmt::format(dfmt, cache_hit/cache_access*100)
+ << "%(d"
+ << fmt::format(dfmt, p.stats.s.cache_dirty/cache_hit)
+ << ",l"
+ << fmt::format(dfmt, p.stats.s.cache_lru/cache_hit)
+ <<"), load/absent="
+ << fmt::format(dfmt, load_absent/p.stats.cache_absent*100)
+ << "%, load-present/absent="
+ << fmt::format(dfmt, p.stats.s.load_present/load_absent)
+ << ")";
+ return out;
+}
+
} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h
index 5cdb1f86142..df5c184e7ab 100644
--- a/src/crimson/os/seastore/seastore_types.h
+++ b/src/crimson/os/seastore/seastore_types.h
@@ -2800,6 +2800,11 @@ struct cache_size_stats_t {
size -= o.size;
num_extents -= o.num_extents;
}
+
+ void divide_by(unsigned d) {
+ size /= d;
+ num_extents /= d;
+ }
};
std::ostream& operator<<(std::ostream&, const cache_size_stats_t&);
struct cache_size_stats_printer_t {
@@ -2825,6 +2830,11 @@ struct cache_io_stats_t {
in_sizes.minus(o.in_sizes);
out_sizes.minus(o.out_sizes);
}
+
+ void divide_by(unsigned d) {
+ in_sizes.divide_by(d);
+ out_sizes.divide_by(d);
+ }
};
struct cache_io_stats_printer_t {
double seconds;
@@ -2861,6 +2871,13 @@ struct dirty_io_stats_t {
out_sizes.minus(o.out_sizes);
out_versions -= o.out_versions;
}
+
+ void divide_by(unsigned d) {
+ in_sizes.divide_by(d);
+ num_replace /= d;
+ out_sizes.divide_by(d);
+ out_versions /= d;
+ }
};
struct dirty_io_stats_printer_t {
double seconds;
@@ -2868,17 +2885,130 @@ struct dirty_io_stats_printer_t {
};
std::ostream& operator<<(std::ostream&, const dirty_io_stats_printer_t&);
+/*
+ * Doesn't account for:
+ * replay
+ * rewrite
+ * retiring/placeholder
+ * get_caching_extent() -- test only
+ * get_caching_extent_by_type() -- test only
+ */
+struct extent_access_stats_t {
+ uint64_t trans_pending = 0;
+ uint64_t trans_dirty = 0;
+ uint64_t trans_lru = 0;
+ uint64_t cache_dirty = 0;
+ uint64_t cache_lru = 0;
+
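+  // reads that had to load extent data: the extent was absent from the
+  // cache (load_absent) or cached but not fully loaded (load_present)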
+ uint64_t load_absent = 0;
+ uint64_t load_present = 0;
+
+ uint64_t get_trans_hit() const {
+ return trans_pending + trans_dirty + trans_lru;
+ }
+
+ uint64_t get_cache_hit() const {
+ return cache_dirty + cache_lru;
+ }
+
+ uint64_t get_estimated_cache_access() const {
+ return get_cache_hit() + load_absent;
+ }
+
+ uint64_t get_estimated_total_access() const {
+ return get_trans_hit() + get_cache_hit() + load_absent;
+ }
+
+ bool is_empty() const {
+ return get_estimated_total_access() == 0;
+ }
+
+ void add(const extent_access_stats_t& o) {
+ trans_pending += o.trans_pending;
+ trans_dirty += o.trans_dirty;
+ trans_lru += o.trans_lru;
+ cache_dirty += o.cache_dirty;
+ cache_lru += o.cache_lru;
+ load_absent += o.load_absent;
+ load_present += o.load_present;
+ }
+
+ void minus(const extent_access_stats_t& o) {
+ trans_pending -= o.trans_pending;
+ trans_dirty -= o.trans_dirty;
+ trans_lru -= o.trans_lru;
+ cache_dirty -= o.cache_dirty;
+ cache_lru -= o.cache_lru;
+ load_absent -= o.load_absent;
+ load_present -= o.load_present;
+ }
+
+ void divide_by(unsigned d) {
+ trans_pending /= d;
+ trans_dirty /= d;
+ trans_lru /= d;
+ cache_dirty /= d;
+ cache_lru /= d;
+ load_absent /= d;
+ load_present /= d;
+ }
+};
+struct extent_access_stats_printer_t {
+ double seconds;
+ const extent_access_stats_t& stats;
+};
+std::ostream& operator<<(std::ostream&, const extent_access_stats_printer_t&);
+
+struct cache_access_stats_t {
+ extent_access_stats_t s;
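+  // accesses that didn't find the extent in the cache index at all,
+  // see Cache::account_absent_access()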
+ uint64_t cache_absent = 0;
+
+ uint64_t get_cache_access() const {
+ return s.get_cache_hit() + cache_absent;
+ }
+
+ uint64_t get_total_access() const {
+ return s.get_trans_hit() + get_cache_access();
+ }
+
+ bool is_empty() const {
+ return get_total_access() == 0;
+ }
+
+ void add(const cache_access_stats_t& o) {
+ s.add(o.s);
+ cache_absent += o.cache_absent;
+ }
+
+ void minus(const cache_access_stats_t& o) {
+ s.minus(o.s);
+ cache_absent -= o.cache_absent;
+ }
+
+ void divide_by(unsigned d) {
+ s.divide_by(d);
+ cache_absent /= d;
+ }
+};
+struct cache_access_stats_printer_t {
+ double seconds;
+ const cache_access_stats_t& stats;
+};
+std::ostream& operator<<(std::ostream&, const cache_access_stats_printer_t&);
+
struct cache_stats_t {
cache_size_stats_t lru_sizes;
cache_io_stats_t lru_io;
cache_size_stats_t dirty_sizes;
dirty_io_stats_t dirty_io;
+ cache_access_stats_t access;
void add(const cache_stats_t& o) {
lru_sizes.add(o.lru_sizes);
lru_io.add(o.lru_io);
dirty_sizes.add(o.dirty_sizes);
dirty_io.add(o.dirty_io);
+ access.add(o.access);
}
};
@@ -2900,6 +3030,7 @@ WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::alloc_delta_t)
WRITE_CLASS_DENC_BOUNDED(crimson::os::seastore::segment_tail_t)
#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::cache_access_stats_printer_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::cache_io_stats_printer_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::cache_size_stats_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::cache_size_stats_printer_t> : fmt::ostream_formatter {};
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
index 23636d79d88..828b8a25592 100644
--- a/src/crimson/os/seastore/transaction_manager.h
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -508,9 +508,11 @@ public:
auto ret = get_extent_if_linked<T>(t, pin->duplicate());
if (ret.index() == 1) {
return std::move(std::get<1>(ret));
+ } else {
+ // absent
+ return base_iertr::make_ready_future<TCachedExtentRef<T>>();
}
}
- return base_iertr::make_ready_future<TCachedExtentRef<T>>();
}).si_then([this, &t, &remaps, original_paddr,
original_laddr, original_len,
&extents, FNAME](auto ext) mutable {
diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc
index 26f7e5dedde..32eaaf02b3f 100644
--- a/src/crimson/osd/ec_backend.cc
+++ b/src/crimson/osd/ec_backend.cc
@@ -32,6 +32,6 @@ ECBackend::submit_transaction(const std::set<pg_shard_t> &pg_shards,
std::vector<pg_log_entry_t>&& log_entries)
{
// todo
- return {seastar::now(),
- seastar::make_ready_future<crimson::osd::acked_peers_t>()};
+ return make_ready_future<rep_op_ret_t>(seastar::now(),
+ seastar::make_ready_future<crimson::osd::acked_peers_t>());
}
diff --git a/src/crimson/osd/object_metadata_helper.cc b/src/crimson/osd/object_metadata_helper.cc
index 3016b0ae406..12bf855afbe 100644
--- a/src/crimson/osd/object_metadata_helper.cc
+++ b/src/crimson/osd/object_metadata_helper.cc
@@ -1,3 +1,6 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab expandtab
+
#include "crimson/osd/object_metadata_helper.h"
namespace {
@@ -23,12 +26,26 @@ subsets_t calc_clone_subsets(
subsets_t subsets;
logger().debug("{}: {} clone_overlap {} ",
__func__, soid, snapset.clone_overlap);
-
+ assert(missing.get_items().contains(soid));
+ const pg_missing_item &missing_item = missing.get_items().at(soid);
+ auto dirty_regions = missing_item.clean_regions.get_dirty_regions();
+ if (dirty_regions.empty()) {
+ logger().debug(
+ "{} {} not touched, no need to recover, skipping",
+ __func__,
+ soid);
+ return subsets;
+ }
uint64_t size = snapset.clone_size[soid.snap];
if (size) {
subsets.data_subset.insert(0, size);
}
+ // let data_subset store only the modified content of the object.
+ subsets.data_subset.intersection_of(dirty_regions);
+ logger().debug("{} {} data_subset {}",
+ __func__, soid, subsets.data_subset);
+
// TODO: make sure CEPH_FEATURE_OSD_CACHEPOOL is not supported in Crimson
// Skips clone subsets if caching was enabled (allow_incomplete_clones).
@@ -140,7 +157,7 @@ subsets_t calc_head_subsets(
subsets.data_subset.insert(0, obj_size);
}
assert(missing.get_items().contains(head));
- const pg_missing_item missing_item = missing.get_items().at(head);
+ const pg_missing_item &missing_item = missing.get_items().at(head);
// let data_subset store only the modified content of the object.
subsets.data_subset.intersection_of(missing_item.clean_regions.get_dirty_regions());
logger().debug("{} {} data_subset {}",
diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc
index 854c4eab6af..df4f73d4077 100644
--- a/src/crimson/osd/ops_executer.cc
+++ b/src/crimson/osd/ops_executer.cc
@@ -844,57 +844,6 @@ std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction(
return log_entries;
}
-OpsExecuter::interruptible_future<> OpsExecuter::snap_map_remove(
- const hobject_t& soid,
- SnapMapper& snap_mapper,
- OSDriver& osdriver,
- ceph::os::Transaction& txn)
-{
- logger().debug("{}: soid {}", __func__, soid);
- return interruptor::async([soid, &snap_mapper,
- _t=osdriver.get_transaction(&txn)]() mutable {
- const auto r = snap_mapper.remove_oid(soid, &_t);
- if (r) {
- logger().error("{}: remove_oid {} failed with {}",
- __func__, soid, r);
- }
- // On removal tolerate missing key corruption
- assert(r == 0 || r == -ENOENT);
- });
-}
-
-OpsExecuter::interruptible_future<> OpsExecuter::snap_map_modify(
- const hobject_t& soid,
- const std::set<snapid_t>& snaps,
- SnapMapper& snap_mapper,
- OSDriver& osdriver,
- ceph::os::Transaction& txn)
-{
- logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps);
- return interruptor::async([soid, snaps, &snap_mapper,
- _t=osdriver.get_transaction(&txn)]() mutable {
- assert(std::size(snaps) > 0);
- [[maybe_unused]] const auto r = snap_mapper.update_snaps(
- soid, snaps, 0, &_t);
- assert(r == 0);
- });
-}
-
-OpsExecuter::interruptible_future<> OpsExecuter::snap_map_clone(
- const hobject_t& soid,
- const std::set<snapid_t>& snaps,
- SnapMapper& snap_mapper,
- OSDriver& osdriver,
- ceph::os::Transaction& txn)
-{
- logger().debug("{}: soid {}, snaps {}", __func__, soid, snaps);
- return interruptor::async([soid, snaps, &snap_mapper,
- _t=osdriver.get_transaction(&txn)]() mutable {
- assert(std::size(snaps) > 0);
- snap_mapper.add_oid(soid, snaps, &_t);
- });
-}
-
// Defined here because there is a circular dependency between OpsExecuter and PG
uint32_t OpsExecuter::get_pool_stripe_width() const {
return pg->get_pgpool().info.get_stripe_width();
@@ -974,6 +923,7 @@ std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone(
0
};
encode(cloned_snaps, cloning_ctx->log_entry.snaps);
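+  // mark the clone's full range dirty so recovery (calc_clone_subsets)
+  // copies the cloned data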
+ cloning_ctx->log_entry.clean_regions.mark_data_region_dirty(0, initial_obs.oi.size);
return cloning_ctx;
}
@@ -1019,14 +969,6 @@ OpsExecuter::flush_clone_metadata(
update_clone_overlap();
if (cloning_ctx) {
std::move(*cloning_ctx).apply_to(log_entries, *obc);
- const auto& coid = log_entries.front().soid;
- const auto& cloned_snaps = obc->ssc->snapset.clone_snaps[coid.snap];
- maybe_snap_mapped = snap_map_clone(
- coid,
- std::set<snapid_t>{std::begin(cloned_snaps), std::end(cloned_snaps)},
- snap_mapper,
- osdriver,
- txn);
}
if (snapc.seq > obc->ssc->snapset.seq) {
// update snapset with latest snap context
diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h
index 834266ce68f..e0e5e10e0a9 100644
--- a/src/crimson/osd/ops_executer.h
+++ b/src/crimson/osd/ops_executer.h
@@ -21,6 +21,7 @@
#include "os/Transaction.h"
#include "osd/osd_types.h"
+#include "crimson/common/coroutine.h"
#include "crimson/common/errorator.h"
#include "crimson/common/interruptible_future.h"
#include "crimson/common/type_helpers.h"
@@ -272,24 +273,7 @@ private:
OSDriver& osdriver,
ceph::os::Transaction& txn);
- static interruptible_future<> snap_map_remove(
- const hobject_t& soid,
- SnapMapper& snap_mapper,
- OSDriver& osdriver,
- ceph::os::Transaction& txn);
- static interruptible_future<> snap_map_modify(
- const hobject_t& soid,
- const std::set<snapid_t>& snaps,
- SnapMapper& snap_mapper,
- OSDriver& osdriver,
- ceph::os::Transaction& txn);
- static interruptible_future<> snap_map_clone(
- const hobject_t& soid,
- const std::set<snapid_t>& snaps,
- SnapMapper& snap_mapper,
- OSDriver& osdriver,
- ceph::os::Transaction& txn);
-
+private:
// this gizmo could be wrapped in std::optional for the sake of lazy
// initialization. we don't need it for ops that doesn't have effect
// TODO: verify the init overhead of chunked_fifo
@@ -424,7 +408,7 @@ public:
const std::vector<OSDOp>& ops,
SnapMapper& snap_mapper,
OSDriver& osdriver,
- MutFunc&& mut_func) &&;
+ MutFunc mut_func) &&;
std::vector<pg_log_entry_t> prepare_transaction(
const std::vector<OSDOp>& ops);
void fill_op_params(modified_by m);
@@ -510,60 +494,60 @@ OpsExecuter::flush_changes_n_do_ops_effects(
const std::vector<OSDOp>& ops,
SnapMapper& snap_mapper,
OSDriver& osdriver,
- MutFunc&& mut_func) &&
+ MutFunc mut_func) &&
{
const bool want_mutate = !txn.empty();
// osd_op_params are instantiated by every wr-like operation.
assert(osd_op_params || !want_mutate);
assert(obc);
- rep_op_fut_t maybe_mutated =
- interruptor::make_ready_future<rep_op_fut_tuple>(
- seastar::now(),
- interruptor::make_interruptible(osd_op_errorator::now()));
+
+ auto submitted = interruptor::now();
+ auto all_completed =
+ interruptor::make_interruptible(osd_op_errorator::now());
+
if (cloning_ctx) {
ceph_assert(want_mutate);
}
+
if (want_mutate) {
- maybe_mutated = flush_clone_metadata(
+ auto log_entries = co_await flush_clone_metadata(
prepare_transaction(ops),
snap_mapper,
osdriver,
- txn
- ).then_interruptible([mut_func=std::move(mut_func),
- this](auto&& log_entries) mutable {
- if (auto log_rit = log_entries.rbegin(); log_rit != log_entries.rend()) {
- ceph_assert(log_rit->version == osd_op_params->at_version);
- }
- auto [submitted, all_completed] =
- std::forward<MutFunc>(mut_func)(std::move(txn),
- std::move(obc),
- std::move(*osd_op_params),
- std::move(log_entries));
- return interruptor::make_ready_future<rep_op_fut_tuple>(
- std::move(submitted),
- osd_op_ierrorator::future<>(std::move(all_completed)));
- });
+ txn);
+
+ if (auto log_rit = log_entries.rbegin(); log_rit != log_entries.rend()) {
+ ceph_assert(log_rit->version == osd_op_params->at_version);
+ }
+
+ auto [_submitted, _all_completed] = co_await mut_func(
+ std::move(txn),
+ std::move(obc),
+ std::move(*osd_op_params),
+ std::move(log_entries));
+
+ submitted = std::move(_submitted);
+ all_completed = std::move(_all_completed);
}
+
apply_stats();
- if (__builtin_expect(op_effects.empty(), true)) {
- return maybe_mutated;
- } else {
- return maybe_mutated.then_unpack_interruptible(
- // need extra ref pg due to apply_stats() which can be executed after
- // informing snap mapper
- [this, pg=this->pg](auto&& submitted, auto&& all_completed) mutable {
- return interruptor::make_ready_future<rep_op_fut_tuple>(
- std::move(submitted),
- all_completed.safe_then_interruptible([this, pg=std::move(pg)] {
- // let's do the cleaning of `op_effects` in destructor
- return interruptor::do_for_each(op_effects,
- [pg=std::move(pg)](auto& op_effect) {
- return op_effect->execute(pg);
- });
- }));
+ if (op_effects.size()) [[unlikely]] {
+ // need extra ref pg due to apply_stats() which can be executed after
+ // informing snap mapper
+ all_completed =
+ std::move(all_completed).safe_then_interruptible([this, pg=this->pg] {
+ // let's do the cleaning of `op_effects` in destructor
+ return interruptor::do_for_each(op_effects,
+ [pg=std::move(pg)](auto& op_effect) {
+ return op_effect->execute(pg);
+ });
});
}
+
+ co_return std::make_tuple(
+ std::move(submitted),
+ std::move(all_completed));
}
template <class Func>
diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc
index 190ea47abd8..8d2d10fbd7c 100644
--- a/src/crimson/osd/osd.cc
+++ b/src/crimson/osd/osd.cc
@@ -718,7 +718,7 @@ seastar::future<> OSD::stop()
DEBUG("prepared to stop");
public_msgr->stop();
cluster_msgr->stop();
- auto gate_close_fut = gate.close();
+ auto gate_close_fut = gate.close_all();
return asok->stop().then([this] {
return heartbeat->stop();
}).then([this] {
@@ -1557,10 +1557,18 @@ seastar::future<> OSD::handle_peering_op(
seastar::future<> OSD::check_osdmap_features()
{
+ LOG_PREFIX(OSD::check_osdmap_features);
assert(seastar::this_shard_id() == PRIMARY_CORE);
- return store.write_meta(
- "require_osd_release",
- stringify((int)osdmap->require_osd_release));
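+  // only persist require_osd_release when it actually changes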
+ if (osdmap->require_osd_release != last_require_osd_release) {
+ DEBUG("updating require_osd_release from {} to {}",
+ to_string(last_require_osd_release),
+ to_string(osdmap->require_osd_release));
+ last_require_osd_release = osdmap->require_osd_release;
+ return store.write_meta(
+ "require_osd_release",
+ stringify((int)osdmap->require_osd_release));
+ }
+ return seastar::now();
}
seastar::future<> OSD::prepare_to_stop()
diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h
index 8df23c53f7a..de39d808274 100644
--- a/src/crimson/osd/osd.h
+++ b/src/crimson/osd/osd.h
@@ -232,7 +232,9 @@ private:
Ref<MOSDPGUpdateLogMissingReply> m);
private:
- crimson::common::Gated gate;
+ crimson::common::gate_per_shard gate;
+
+ ceph_release_t last_require_osd_release{ceph_release_t::unknown};
seastar::promise<> stop_acked;
void got_stop_ack() {
diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc
index 509d4c4a484..c030c9d8970 100644
--- a/src/crimson/osd/osd_operations/background_recovery.cc
+++ b/src/crimson/osd/osd_operations/background_recovery.cc
@@ -158,6 +158,8 @@ PglogBasedRecovery::PglogBasedRecovery(
PglogBasedRecovery::interruptible_future<bool>
PglogBasedRecovery::do_recovery()
{
+ LOG_PREFIX(PglogBasedRecovery::do_recovery);
+ DEBUGDPPI("{}: {}", *pg, __func__, *this);
if (pg->has_reset_since(epoch_started)) {
return seastar::make_ready_future<bool>(false);
}
@@ -167,6 +169,7 @@ PglogBasedRecovery::do_recovery()
interruptor>([this] (auto&& trigger) {
return pg->get_recovery_handler()->start_recovery_ops(
trigger,
+ *this,
crimson::common::local_conf()->osd_recovery_max_single_start);
});
});
diff --git a/src/crimson/osd/osd_operations/background_recovery.h b/src/crimson/osd/osd_operations/background_recovery.h
index 17f2cd57a30..5ae0e1a9edb 100644
--- a/src/crimson/osd/osd_operations/background_recovery.h
+++ b/src/crimson/osd/osd_operations/background_recovery.h
@@ -91,8 +91,20 @@ public:
RecoveryBackend::RecoveryBlockingEvent
> tracking_events;
+ void cancel() {
+ cancelled = true;
+ }
+
+ bool is_cancelled() const {
+ return cancelled;
+ }
+
+ epoch_t get_epoch_started() const {
+ return epoch_started;
+ }
private:
interruptible_future<bool> do_recovery() override;
+ bool cancelled = false;
};
class BackfillRecovery final : public BackgroundRecoveryT<BackfillRecovery> {
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc
index f1247c3fa08..7512b3d108d 100644
--- a/src/crimson/osd/osd_operations/snaptrim_event.cc
+++ b/src/crimson/osd/osd_operations/snaptrim_event.cc
@@ -197,7 +197,7 @@ SnapTrimObjSubEvent::remove_clone(
pg->get_collection_ref()->get_cid(),
ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD});
obc->obs.oi = object_info_t(coid);
- return OpsExecuter::snap_map_remove(coid, pg->snap_mapper, pg->osdriver, txn);
+ return interruptor::now();
}
void SnapTrimObjSubEvent::remove_head_whiteout(
@@ -263,7 +263,7 @@ SnapTrimObjSubEvent::adjust_snaps(
ghobject_t{coid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD},
OI_ATTR,
bl);
- add_log_entry(
+ auto &loge = add_log_entry(
pg_log_entry_t::MODIFY,
coid,
obc->obs.oi.prior_version,
@@ -271,8 +271,10 @@ SnapTrimObjSubEvent::adjust_snaps(
osd_reqid_t(),
obc->obs.oi.mtime,
0);
- return OpsExecuter::snap_map_modify(
- coid, new_snaps, pg->snap_mapper, pg->osdriver, txn);
+ bufferlist snapsbl;
+ encode(new_snaps, snapsbl);
+ loge.snaps.swap(snapsbl);
+ return interruptor::now();
}
void SnapTrimObjSubEvent::update_head(
@@ -361,6 +363,7 @@ SnapTrimObjSubEvent::remove_or_update(
// save head snapset
logger().debug("{}: {} new snapset {} on {}",
*this, coid, head_obc->ssc->snapset, head_obc->obs.oi);
+ osd_op_p.at_version.version++;
if (head_obc->ssc->snapset.clones.empty() && head_obc->obs.oi.is_whiteout()) {
remove_head_whiteout(obc, head_obc, txn);
} else {
@@ -400,32 +403,8 @@ SnapTrimObjSubEvent::start()
// lock both clone's and head's obcs
co_await pg->obc_loader.with_obc<RWState::RWWRITE>(
coid,
- [this](auto head_obc, auto clone_obc) {
- logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid());
- return enter_stage<interruptor>(
- client_pp().process
- ).then_interruptible(
- [this,clone_obc=std::move(clone_obc), head_obc=std::move(head_obc)]() mutable {
- logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid());
- return remove_or_update(
- clone_obc, head_obc
- ).safe_then_interruptible([clone_obc, this](auto&& txn) mutable {
- auto [submitted, all_completed] = pg->submit_transaction(
- std::move(clone_obc),
- std::move(txn),
- std::move(osd_op_p),
- std::move(log_entries));
- return submitted.then_interruptible(
- [this, all_completed=std::move(all_completed)]() mutable {
- return enter_stage<interruptor>(
- client_pp().wait_repop
- ).then_interruptible([all_completed=std::move(all_completed)]() mutable{
- return std::move(all_completed);
- });
- });
- });
- });
- },
+ std::bind(&SnapTrimObjSubEvent::process_and_submit,
+ this, std::placeholders::_1, std::placeholders::_2),
false
).handle_error_interruptible(
remove_or_update_iertr::pass_further{},
@@ -436,6 +415,33 @@ SnapTrimObjSubEvent::start()
co_await interruptor::make_interruptible(handle.complete());
}
+ObjectContextLoader::load_obc_iertr::future<>
+SnapTrimObjSubEvent::process_and_submit(ObjectContextRef head_obc,
+ ObjectContextRef clone_obc) {
+ logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid());
+
+ co_await enter_stage<interruptor>(client_pp().process);
+
+ logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid());
+
+ auto txn = co_await remove_or_update(clone_obc, head_obc);
+
+ auto [submitted, all_completed] = co_await pg->submit_transaction(
+ std::move(clone_obc),
+ std::move(txn),
+ std::move(osd_op_p),
+ std::move(log_entries)
+ );
+
+ co_await std::move(submitted);
+
+ co_await enter_stage<interruptor>(client_pp().wait_repop);
+
+ co_await std::move(all_completed);
+
+ co_return;
+}
+
void SnapTrimObjSubEvent::print(std::ostream &lhs) const
{
lhs << "SnapTrimObjSubEvent("
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h
index 0e6c1e9d8fc..06d8f43c2f3 100644
--- a/src/crimson/osd/osd_operations/snaptrim_event.h
+++ b/src/crimson/osd/osd_operations/snaptrim_event.h
@@ -113,6 +113,10 @@ public:
private:
object_stat_sum_t delta_stats;
+ ObjectContextLoader::load_obc_iertr::future<> process_and_submit(
+ ObjectContextRef head_obc,
+ ObjectContextRef clone_obc);
+
snap_trim_obj_subevent_ret_t remove_clone(
ObjectContextRef obc,
ObjectContextRef head_obc,
@@ -134,7 +138,7 @@ private:
remove_or_update_iertr::future<ceph::os::Transaction>
remove_or_update(ObjectContextRef obc, ObjectContextRef head_obc);
- void add_log_entry(
+ pg_log_entry_t& add_log_entry(
int _op,
const hobject_t& _soid,
const eversion_t& pv,
@@ -151,7 +155,7 @@ private:
rid,
mt,
return_code);
- osd_op_p.at_version.version++;
+ return log_entries.back();
}
Ref<PG> pg;
diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc
index ad34373c74f..644cc84513d 100644
--- a/src/crimson/osd/pg.cc
+++ b/src/crimson/osd/pg.cc
@@ -27,6 +27,7 @@
#include "os/Transaction.h"
+#include "crimson/common/coroutine.h"
#include "crimson/common/exception.h"
#include "crimson/common/log.h"
#include "crimson/net/Connection.h"
@@ -514,6 +515,7 @@ PG::do_delete_work(ceph::os::Transaction &t, ghobject_t _next)
Context *PG::on_clean()
{
+ recovery_handler->on_pg_clean();
scrubber.on_primary_active_clean();
return nullptr;
}
@@ -893,8 +895,9 @@ void PG::mutate_object(
}
}
-std::tuple<PG::interruptible_future<>,
- PG::interruptible_future<>>
+PG::interruptible_future<
+ std::tuple<PG::interruptible_future<>,
+ PG::interruptible_future<>>>
PG::submit_transaction(
ObjectContextRef&& obc,
ceph::os::Transaction&& txn,
@@ -902,9 +905,10 @@ PG::submit_transaction(
std::vector<pg_log_entry_t>&& log_entries)
{
if (__builtin_expect(stopping, false)) {
- return {seastar::make_exception_future<>(
- crimson::common::system_shutdown_exception()),
- seastar::now()};
+ co_return std::make_tuple(
+ interruptor::make_interruptible(seastar::make_exception_future<>(
+ crimson::common::system_shutdown_exception())),
+ interruptor::now());
}
epoch_t map_epoch = get_osdmap_epoch();
@@ -916,7 +920,7 @@ PG::submit_transaction(
ceph_assert(log_entries.rbegin()->version >= projected_last_update);
projected_last_update = log_entries.rbegin()->version;
- auto [submitted, all_completed] = backend->submit_transaction(
+ auto [submitted, all_completed] = co_await backend->submit_transaction(
peering_state.get_acting_recovery_backfill(),
obc->obs.oi.soid,
std::move(txn),
@@ -924,16 +928,19 @@ PG::submit_transaction(
peering_state.get_last_peering_reset(),
map_epoch,
std::move(log_entries));
- return std::make_tuple(std::move(submitted), all_completed.then_interruptible(
- [this, last_complete=peering_state.get_info().last_complete,
+ co_return std::make_tuple(
+ std::move(submitted),
+ all_completed.then_interruptible(
+ [this, last_complete=peering_state.get_info().last_complete,
at_version=osd_op_p.at_version](auto acked) {
- for (const auto& peer : acked) {
- peering_state.update_peer_last_complete_ondisk(
- peer.shard, peer.last_complete_ondisk);
- }
- peering_state.complete_write(at_version, last_complete);
- return seastar::now();
- }));
+ for (const auto& peer : acked) {
+ peering_state.update_peer_last_complete_ondisk(
+ peer.shard, peer.last_complete_ondisk);
+ }
+ peering_state.complete_write(at_version, last_complete);
+ return seastar::now();
+ })
+ );
}
PG::interruptible_future<> PG::repair_object(
@@ -1439,14 +1446,10 @@ void PG::update_stats(const pg_stat_t &stat) {
PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
{
- if (__builtin_expect(stopping, false)) {
- return seastar::make_exception_future<>(
- crimson::common::system_shutdown_exception());
- }
-
- logger().debug("{}: {}", __func__, *req);
+ LOG_PREFIX(PG::handle_rep_op);
+ DEBUGDPP("{}", *this, *req);
if (can_discard_replica_op(*req)) {
- return seastar::now();
+ co_return;
}
ceph::os::Transaction txn;
@@ -1456,6 +1459,11 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
std::vector<pg_log_entry_t> log_entries;
decode(log_entries, p);
update_stats(req->pg_stats);
+
+ co_await update_snap_map(
+ log_entries,
+ txn);
+
log_operation(std::move(log_entries),
req->pg_trim_to,
req->version,
@@ -1463,18 +1471,42 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
!txn.empty(),
txn,
false);
- logger().debug("PG::handle_rep_op: do_transaction...");
- return interruptor::make_interruptible(shard_services.get_store().do_transaction(
- coll_ref, std::move(txn))).then_interruptible(
- [req, lcod=peering_state.get_info().last_complete, this] {
- peering_state.update_last_complete_ondisk(lcod);
- const auto map_epoch = get_osdmap_epoch();
- auto reply = crimson::make_message<MOSDRepOpReply>(
- req.get(), pg_whoami, 0,
- map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
- reply->set_last_complete_ondisk(lcod);
- return shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch);
- });
+ DEBUGDPP("{} do_transaction", *this, *req);
+ co_await interruptor::make_interruptible(
+ shard_services.get_store().do_transaction(coll_ref, std::move(txn))
+ );
+
+ const auto &lcod = peering_state.get_info().last_complete;
+ peering_state.update_last_complete_ondisk(lcod);
+ const auto map_epoch = get_osdmap_epoch();
+ auto reply = crimson::make_message<MOSDRepOpReply>(
+ req.get(), pg_whoami, 0,
+ map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
+ reply->set_last_complete_ondisk(lcod);
+ co_await interruptor::make_interruptible(
+ shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch)
+ );
+ co_return;
+}
+
+PG::interruptible_future<> PG::update_snap_map(
+ const std::vector<pg_log_entry_t> &log_entries,
+ ObjectStore::Transaction& t)
+{
+ LOG_PREFIX(PG::update_snap_map);
+ DEBUGDPP("", *this);
+ return interruptor::do_for_each(
+ log_entries,
+ [this, &t](const auto& entry) mutable {
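+      // only clone objects (snap < CEPH_MAXSNAP) have snap mapper entries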
+ if (entry.soid.snap < CEPH_MAXSNAP) {
+ // TODO: avoid seastar::async https://tracker.ceph.com/issues/67704
+ return interruptor::async(
+ [this, entry, _t=osdriver.get_transaction(&t)]() mutable {
+ snap_mapper.update_snap_map(entry, &_t);
+ });
+ }
+ return interruptor::now();
+ });
}
void PG::log_operation(
@@ -1496,10 +1528,6 @@ void PG::log_operation(
* handles these cases.
*/
#if 0
- if (transaction_applied) {
- //TODO:
- //update_snap_map(logv, t);
- }
auto last = logv.rbegin();
if (is_primary() && last != logv.rend()) {
projected_log.skip_can_rollback_to_to_head();
@@ -1700,6 +1728,7 @@ void PG::on_change(ceph::os::Transaction &t) {
peering_state.state_clear(PG_STATE_SNAPTRIM);
peering_state.state_clear(PG_STATE_SNAPTRIM_ERROR);
snap_mapper.reset_backend();
+ reset_pglog_based_recovery_op();
}
void PG::context_registry_on_change() {
@@ -1839,4 +1868,19 @@ void PG::PGLogEntryHandler::remove(const hobject_t &soid) {
DEBUGDPP("remove {} on pglog rollback", *pg, soid);
pg->remove_maybe_snapmapped_object(*t, soid);
}
+
+void PG::set_pglog_based_recovery_op(PglogBasedRecovery *op) {
+ ceph_assert(!pglog_based_recovery_op);
+ pglog_based_recovery_op = op;
+}
+
+void PG::reset_pglog_based_recovery_op() {
+ pglog_based_recovery_op = nullptr;
+}
+
+void PG::cancel_pglog_based_recovery_op() {
+ ceph_assert(pglog_based_recovery_op);
+ pglog_based_recovery_op->cancel();
+ reset_pglog_based_recovery_op();
+}
}
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h
index b1e5e1fa22a..11c0e3668b1 100644
--- a/src/crimson/osd/pg.h
+++ b/src/crimson/osd/pg.h
@@ -64,6 +64,7 @@ namespace crimson::osd {
class OpsExecuter;
class BackfillRecovery;
class SnapTrimEvent;
+class PglogBasedRecovery;
class PG : public boost::intrusive_ref_counter<
PG,
@@ -433,6 +434,10 @@ public:
recovery_handler->backfill_cancelled();
}
+ void on_recovery_cancelled() final {
+ cancel_pglog_based_recovery_op();
+ }
+
void on_recovery_reserved() final {
recovery_handler->start_pglogbased_recovery();
}
@@ -591,6 +596,9 @@ public:
interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m);
void update_stats(const pg_stat_t &stat);
+ interruptible_future<> update_snap_map(
+ const std::vector<pg_log_entry_t> &log_entries,
+ ObjectStore::Transaction& t);
void log_operation(
std::vector<pg_log_entry_t>&& logv,
const eversion_t &trim_to,
@@ -673,7 +681,8 @@ private:
SuccessFunc&& success_func,
FailureFunc&& failure_func);
interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m);
- std::tuple<interruptible_future<>, interruptible_future<>>
+ interruptible_future<
+ std::tuple<interruptible_future<>, interruptible_future<>>>
submit_transaction(
ObjectContextRef&& obc,
ceph::os::Transaction&& txn,
@@ -838,6 +847,10 @@ public:
return can_discard_replica_op(m, m.get_map_epoch());
}
+ void set_pglog_based_recovery_op(PglogBasedRecovery *op) final;
+ void reset_pglog_based_recovery_op() final;
+ void cancel_pglog_based_recovery_op();
+
private:
// instead of seastar::gate, we use a boolean flag to indicate
// whether the system is shutting down, as we don't need to track
@@ -845,6 +858,7 @@ private:
bool stopping = false;
PGActivationBlocker wait_for_active_blocker;
+ PglogBasedRecovery* pglog_based_recovery_op = nullptr;
friend std::ostream& operator<<(std::ostream&, const PG& pg);
friend class ClientRequest;
diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h
index 21dce24b899..fa1f1405ffe 100644
--- a/src/crimson/osd/pg_backend.h
+++ b/src/crimson/osd/pg_backend.h
@@ -60,9 +60,10 @@ public:
using interruptible_future =
::crimson::interruptible::interruptible_future<
::crimson::osd::IOInterruptCondition, T>;
- using rep_op_fut_t =
+ using rep_op_ret_t =
std::tuple<interruptible_future<>,
interruptible_future<crimson::osd::acked_peers_t>>;
+ using rep_op_fut_t = interruptible_future<rep_op_ret_t>;
PGBackend(shard_id_t shard, CollectionRef coll,
crimson::osd::ShardServices &shard_services,
DoutPrefixProvider &dpp);
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
index b583c87a3f6..4f874d526b3 100644
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -24,29 +24,33 @@ namespace {
using std::map;
using std::set;
+using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
void PGRecovery::start_pglogbased_recovery()
{
- using PglogBasedRecovery = crimson::osd::PglogBasedRecovery;
- (void) pg->get_shard_services().start_operation<PglogBasedRecovery>(
+ auto [op, fut] = pg->get_shard_services().start_operation<PglogBasedRecovery>(
static_cast<crimson::osd::PG*>(pg),
pg->get_shard_services(),
pg->get_osdmap_epoch(),
float(0.001));
+ pg->set_pglog_based_recovery_op(op.get());
}
PGRecovery::interruptible_future<bool>
PGRecovery::start_recovery_ops(
RecoveryBackend::RecoveryBlockingEvent::TriggerI& trigger,
+ PglogBasedRecovery &recover_op,
size_t max_to_start)
{
assert(pg->is_primary());
assert(pg->is_peered());
- if (!pg->is_recovering() && !pg->is_backfilling()) {
- logger().debug("recovery raced and were queued twice, ignoring!");
+ if (pg->has_reset_since(recover_op.get_epoch_started()) ||
+ recover_op.is_cancelled()) {
+ logger().debug("recovery {} cancelled.", recover_op);
return seastar::make_ready_future<bool>(false);
}
+ ceph_assert(pg->is_recovering());
// in ceph-osd the do_recovery() path handles both the pg log-based
// recovery and the backfill, albeit they are separated at the layer
@@ -68,12 +72,15 @@ PGRecovery::start_recovery_ops(
return interruptor::parallel_for_each(started,
[] (auto&& ifut) {
return std::move(ifut);
- }).then_interruptible([this] {
+ }).then_interruptible([this, &recover_op] {
//TODO: maybe we should implement a recovery race interruptor in the future
- if (!pg->is_recovering() && !pg->is_backfilling()) {
- logger().debug("recovery raced and were queued twice, ignoring!");
+ if (pg->has_reset_since(recover_op.get_epoch_started()) ||
+ recover_op.is_cancelled()) {
+ logger().debug("recovery {} cancelled.", recover_op);
return seastar::make_ready_future<bool>(false);
}
+ ceph_assert(pg->is_recovering());
+ ceph_assert(!pg->is_backfilling());
bool done = !pg->get_peering_state().needs_recovery();
if (done) {
@@ -101,6 +108,7 @@ PGRecovery::start_recovery_ops(
pg->get_osdmap_epoch(),
PeeringState::RequestBackfill{});
}
+ pg->reset_pglog_based_recovery_op();
}
return seastar::make_ready_future<bool>(!done);
});
@@ -603,9 +611,13 @@ bool PGRecovery::budget_available() const
return true;
}
-void PGRecovery::backfilled()
+void PGRecovery::on_pg_clean()
{
backfill_state.reset();
+}
+
+void PGRecovery::backfilled()
+{
using LocalPeeringEvent = crimson::osd::LocalPeeringEvent;
std::ignore = pg->get_shard_services().start_operation<LocalPeeringEvent>(
static_cast<crimson::osd::PG*>(pg),
diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h
index c2f289e3cf6..6cd29c3dc52 100644
--- a/src/crimson/osd/pg_recovery.h
+++ b/src/crimson/osd/pg_recovery.h
@@ -17,6 +17,7 @@
namespace crimson::osd {
class UrgentRecovery;
+class PglogBasedRecovery;
}
class MOSDPGBackfillRemove;
@@ -32,6 +33,7 @@ public:
interruptible_future<bool> start_recovery_ops(
RecoveryBackend::RecoveryBlockingEvent::TriggerI&,
+ crimson::osd::PglogBasedRecovery &recover_op,
size_t max_to_start);
void on_activate_complete();
void on_backfill_reserved();
@@ -42,6 +44,7 @@ public:
}
seastar::future<> stop() { return seastar::now(); }
+ void on_pg_clean();
private:
PGRecoveryListener* pg;
size_t start_primary_recovery_ops(
diff --git a/src/crimson/osd/pg_recovery_listener.h b/src/crimson/osd/pg_recovery_listener.h
index a53221c4a69..6c88b170b4f 100644
--- a/src/crimson/osd/pg_recovery_listener.h
+++ b/src/crimson/osd/pg_recovery_listener.h
@@ -11,6 +11,7 @@
namespace crimson::osd {
class ShardServices;
+ class PglogBasedRecovery;
};
class RecoveryBackend;
@@ -38,4 +39,7 @@ public:
virtual void publish_stats_to_osd() = 0;
virtual OSDriver &get_osdriver() = 0;
virtual SnapMapper &get_snap_mapper() = 0;
+ virtual void set_pglog_based_recovery_op(
+ crimson::osd::PglogBasedRecovery *op) = 0;
+ virtual void reset_pglog_based_recovery_op() = 0;
};
diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc
index 6ec8d30d596..cbb8c883e07 100644
--- a/src/crimson/osd/replicated_backend.cc
+++ b/src/crimson/osd/replicated_backend.cc
@@ -5,6 +5,7 @@
#include "messages/MOSDRepOpReply.h"
+#include "crimson/common/coroutine.h"
#include "crimson/common/exception.h"
#include "crimson/common/log.h"
#include "crimson/os/futurized_store.h"
@@ -38,13 +39,16 @@ ReplicatedBackend::_read(const hobject_t& hoid,
ReplicatedBackend::rep_op_fut_t
ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
const hobject_t& hoid,
- ceph::os::Transaction&& txn,
- osd_op_params_t&& osd_op_p,
+ ceph::os::Transaction&& t,
+ osd_op_params_t&& opp,
epoch_t min_epoch, epoch_t map_epoch,
- std::vector<pg_log_entry_t>&& log_entries)
+ std::vector<pg_log_entry_t>&& logv)
{
LOG_PREFIX(ReplicatedBackend::submit_transaction);
DEBUGDPP("object {}", dpp, hoid);
+ auto log_entries = std::move(logv);
+ auto txn = std::move(t);
+ auto osd_op_p = std::move(opp);
const ceph_tid_t tid = shard_services.get_tid();
auto pending_txn =
@@ -89,6 +93,8 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
}
}
+ co_await pg.update_snap_map(log_entries, txn);
+
pg.log_operation(
std::move(log_entries),
osd_op_p.pg_trim_to,
@@ -99,8 +105,8 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
false);
auto all_completed = interruptor::make_interruptible(
- shard_services.get_store().do_transaction(coll, std::move(txn))
- ).then_interruptible([FNAME, this,
+ shard_services.get_store().do_transaction(coll, std::move(txn))
+ ).then_interruptible([FNAME, this,
peers=pending_txn->second.weak_from_this()] {
if (!peers) {
// for now, only actingset_changed can cause peers
@@ -117,13 +123,14 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
}).then_interruptible([pending_txn, this] {
auto acked_peers = std::move(pending_txn->second.acked_peers);
pending_trans.erase(pending_txn);
- return seastar::make_ready_future<crimson::osd::acked_peers_t>(std::move(acked_peers));
+ return seastar::make_ready_future<
+ crimson::osd::acked_peers_t>(std::move(acked_peers));
});
auto sends_complete = seastar::when_all_succeed(
sends->begin(), sends->end()
).finally([sends=std::move(sends)] {});
- return {std::move(sends_complete), std::move(all_completed)};
+ co_return std::make_tuple(std::move(sends_complete), std::move(all_completed));
}
void ReplicatedBackend::on_actingset_changed(bool same_primary)
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 57eb18b0d3e..627f4a3e85b 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -440,6 +440,7 @@ enum {
CEPH_MDS_OP_QUIESCE_PATH = 0x01508,
CEPH_MDS_OP_QUIESCE_INODE = 0x01509,
CEPH_MDS_OP_LOCK_PATH = 0x0150a,
+ CEPH_MDS_OP_UNINLINE_DATA = 0x0150b
};
#define IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE || \
diff --git a/src/include/cephfs/types.h b/src/include/cephfs/types.h
index 314dbfad6ca..73efc73ae9a 100644
--- a/src/include/cephfs/types.h
+++ b/src/include/cephfs/types.h
@@ -861,6 +861,8 @@ void inode_t<Allocator>::dump(ceph::Formatter *f) const
f->dump_unsigned("file_data_version", file_data_version);
f->dump_unsigned("xattr_version", xattr_version);
f->dump_unsigned("backtrace_version", backtrace_version);
+ f->dump_unsigned("inline_data_version", inline_data.version);
+ f->dump_unsigned("inline_data_length", inline_data.length());
f->dump_string("stray_prior_path", stray_prior_path);
f->dump_unsigned("max_size_ever", max_size_ever);
diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake
index b10ea7c27cb..48358fce936 100644
--- a/src/include/config-h.in.cmake
+++ b/src/include/config-h.in.cmake
@@ -393,6 +393,9 @@
/* Define if libcryptsetup can be used (linux only) */
#cmakedefine HAVE_LIBCRYPTSETUP
+/* Define if libnbd can be used */
+#cmakedefine HAVE_LIBNBD
+
/* Shared library extension, such as .so, .dll or .dylib */
#cmakedefine CMAKE_SHARED_LIBRARY_SUFFIX "@CMAKE_SHARED_LIBRARY_SUFFIX@"
diff --git a/src/librbd/CMakeLists.txt b/src/librbd/CMakeLists.txt
index 3ba46028f0f..b66e5ead40a 100644
--- a/src/librbd/CMakeLists.txt
+++ b/src/librbd/CMakeLists.txt
@@ -77,6 +77,7 @@ set(librbd_internal_srcs
exclusive_lock/PostAcquireRequest.cc
exclusive_lock/PreReleaseRequest.cc
exclusive_lock/StandardPolicy.cc
+ group/ListSnapshotsRequest.cc
image/AttachChildRequest.cc
image/AttachParentRequest.cc
image/CloneRequest.cc
@@ -219,6 +220,11 @@ if(LINUX AND HAVE_LIBCRYPTSETUP)
crypto/luks/Magic.cc)
endif()
+if(HAVE_LIBNBD)
+ list(APPEND librbd_internal_srcs
+ migration/NBDStream.cc)
+endif()
+
add_library(rbd_api STATIC librbd.cc)
add_library(rbd_internal STATIC
${librbd_internal_srcs}
@@ -240,6 +246,10 @@ if(LINUX AND HAVE_LIBCRYPTSETUP)
target_include_directories(rbd_internal PRIVATE ${LIBCRYPTSETUP_INCLUDE_DIR})
target_link_libraries(rbd_internal PRIVATE ${LIBCRYPTSETUP_LIBRARIES})
endif()
+if(HAVE_LIBNBD)
+ target_include_directories(rbd_internal PRIVATE ${LIBNBD_INCLUDE_DIR})
+ target_link_libraries(rbd_internal PRIVATE ${LIBNBD_LIBRARIES})
+endif()
add_custom_target(librbd_plugins)
set(librbd_plugins_dir ${CEPH_INSTALL_PKGLIBDIR}/librbd)
diff --git a/src/librbd/api/Group.cc b/src/librbd/api/Group.cc
index bf1c53284f2..97c53abd70f 100644
--- a/src/librbd/api/Group.cc
+++ b/src/librbd/api/Group.cc
@@ -11,6 +11,7 @@
#include "librbd/ImageWatcher.h"
#include "librbd/Operations.h"
#include "librbd/Utils.h"
+#include "librbd/group/ListSnapshotsRequest.h"
#include "librbd/internal.h"
#include "librbd/io/AioCompletion.h"
@@ -53,36 +54,18 @@ snap_t get_group_snap_id(I* ictx,
return CEPH_NOSNAP;
}
+template <typename I>
int group_snap_list(librados::IoCtx& group_ioctx, const std::string& group_id,
- std::vector<cls::rbd::GroupSnapshot> *cls_snaps)
+ bool try_to_sort, bool fail_if_not_sorted,
+ std::vector<cls::rbd::GroupSnapshot> *cls_snaps)
{
- CephContext *cct = (CephContext *)group_ioctx.cct();
-
- string group_header_oid = util::group_header_name(group_id);
-
- const int max_read = 1024;
- cls::rbd::GroupSnapshot snap_last;
- int r;
-
- for (;;) {
- vector<cls::rbd::GroupSnapshot> snaps_page;
-
- r = cls_client::group_snap_list(&group_ioctx, group_header_oid,
- snap_last, max_read, &snaps_page);
-
- if (r < 0) {
- lderr(cct) << "error reading snap list from group: "
- << cpp_strerror(-r) << dendl;
- return r;
- }
- cls_snaps->insert(cls_snaps->end(), snaps_page.begin(), snaps_page.end());
- if (snaps_page.size() < max_read) {
- break;
- }
- snap_last = *snaps_page.rbegin();
- }
-
- return 0;
+ C_SaferCond cond;
+ auto req = group::ListSnapshotsRequest<I>::create(group_ioctx, group_id,
+ try_to_sort,
+ fail_if_not_sorted,
+ cls_snaps, &cond);
+ req->send();
+ return cond.wait();
}
std::string calc_ind_image_snap_name(uint64_t pool_id,
@@ -593,7 +576,7 @@ int Group<I>::remove(librados::IoCtx& io_ctx, const char *group_name)
string group_header_oid = util::group_header_name(group_id);
std::vector<cls::rbd::GroupSnapshot> snaps;
- r = group_snap_list(io_ctx, group_id, &snaps);
+ r = group_snap_list<I>(io_ctx, group_id, false, false, &snaps);
if (r < 0 && r != -ENOENT) {
lderr(cct) << "error listing group snapshots" << dendl;
return r;
@@ -1191,7 +1174,7 @@ int Group<I>::snap_remove(librados::IoCtx& group_ioctx, const char *group_name,
}
std::vector<cls::rbd::GroupSnapshot> snaps;
- r = group_snap_list(group_ioctx, group_id, &snaps);
+ r = group_snap_list<I>(group_ioctx, group_id, false, false, &snaps);
if (r < 0) {
return r;
}
@@ -1232,7 +1215,7 @@ int Group<I>::snap_rename(librados::IoCtx& group_ioctx, const char *group_name,
}
std::vector<cls::rbd::GroupSnapshot> group_snaps;
- r = group_snap_list(group_ioctx, group_id, &group_snaps);
+ r = group_snap_list<I>(group_ioctx, group_id, false, false, &group_snaps);
if (r < 0) {
return r;
}
@@ -1261,6 +1244,7 @@ int Group<I>::snap_rename(librados::IoCtx& group_ioctx, const char *group_name,
template <typename I>
int Group<I>::snap_list(librados::IoCtx& group_ioctx, const char *group_name,
+ bool try_to_sort, bool fail_if_not_sorted,
std::vector<group_snap_info2_t> *group_snaps)
{
CephContext *cct = (CephContext *)group_ioctx.cct();
@@ -1275,7 +1259,8 @@ int Group<I>::snap_list(librados::IoCtx& group_ioctx, const char *group_name,
}
std::vector<cls::rbd::GroupSnapshot> cls_group_snaps;
- r = group_snap_list(group_ioctx, group_id, &cls_group_snaps);
+ r = group_snap_list<I>(group_ioctx, group_id, try_to_sort, fail_if_not_sorted,
+ &cls_group_snaps);
if (r < 0) {
return r;
}
@@ -1311,7 +1296,7 @@ int Group<I>::snap_get_info(librados::IoCtx& group_ioctx,
}
std::vector<cls::rbd::GroupSnapshot> cls_group_snaps;
- r = group_snap_list(group_ioctx, group_id, &cls_group_snaps);
+ r = group_snap_list<I>(group_ioctx, group_id, false, false, &cls_group_snaps);
if (r < 0) {
return r;
}
@@ -1353,7 +1338,7 @@ int Group<I>::snap_rollback(librados::IoCtx& group_ioctx,
}
std::vector<cls::rbd::GroupSnapshot> snaps;
- r = group_snap_list(group_ioctx, group_id, &snaps);
+ r = group_snap_list<I>(group_ioctx, group_id, false, false, &snaps);
if (r < 0) {
return r;
}
diff --git a/src/librbd/api/Group.h b/src/librbd/api/Group.h
index 2a7574612ec..ffbb9afea1a 100644
--- a/src/librbd/api/Group.h
+++ b/src/librbd/api/Group.h
@@ -47,6 +47,7 @@ struct Group {
static int snap_rename(librados::IoCtx& group_ioctx, const char *group_name,
const char *old_snap_name, const char *new_snap_name);
static int snap_list(librados::IoCtx& group_ioctx, const char *group_name,
+ bool try_to_sort, bool fail_if_not_sorted,
std::vector<group_snap_info2_t> *snaps);
static int snap_get_info(librados::IoCtx& group_ioctx,
const char *group_name, const char *snap_name,
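For orientation, a minimal caller-side sketch of the widened snap_list() API (hedged: the group name and IoCtx are placeholders; the flag values mirror the librbd.cc call sites later in this patch):

    // Ask for a creation-order sorted listing, but fall back to an
    // unsorted one when the cluster cannot report snapshot order
    // (the -EOPNOTSUPP path in ListSnapshotsRequest below).
    std::vector<librbd::group_snap_info2_t> snaps;
    int r = librbd::api::Group<>::snap_list(group_ioctx, "mygroup",
                                            true /* try_to_sort */,
                                            false /* fail_if_not_sorted */,
                                            &snaps);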
diff --git a/src/librbd/group/ListSnapshotsRequest.cc b/src/librbd/group/ListSnapshotsRequest.cc
new file mode 100644
index 00000000000..bbd3759ff2b
--- /dev/null
+++ b/src/librbd/group/ListSnapshotsRequest.cc
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/group/ListSnapshotsRequest.h"
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/ceph_context.h"
+#include "cls/rbd/cls_rbd_client.h"
+#include "librbd/Utils.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::group::ListSnapshotsRequest: " << this \
+ << " " << __func__ << ": "
+
+namespace librbd {
+namespace group {
+
+namespace {
+
+const uint32_t MAX_RETURN = 1024;
+
+} // anonymous namespace
+
+template <typename I>
+ListSnapshotsRequest<I>::ListSnapshotsRequest(librados::IoCtx &group_io_ctx,
+ const std::string &group_id,
+ bool try_to_sort,
+ bool fail_if_not_sorted,
+ std::vector<cls::rbd::GroupSnapshot> *snaps,
+ Context *on_finish)
+ : m_group_io_ctx(group_io_ctx), m_group_id(group_id),
+ m_try_to_sort(try_to_sort), m_fail_if_not_sorted(fail_if_not_sorted),
+ m_snaps(snaps), m_on_finish(on_finish) {
+ auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct());
+ ldout(cct, 20) << "group_id=" << m_group_id
+ << ", try_to_sort=" << m_try_to_sort
+ << ", fail_if_not_sorted=" << m_fail_if_not_sorted
+ << dendl;
+}
+
+template <typename I>
+void ListSnapshotsRequest<I>::send() {
+ list_snap_orders();
+}
+
+template <typename I>
+void ListSnapshotsRequest<I>::list_snap_orders() {
+ if (!m_try_to_sort) {
+ list_snaps();
+ return;
+ }
+
+ auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct());
+ ldout(cct, 10) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::group_snap_list_order_start(&op, m_start_after_order, MAX_RETURN);
+ auto comp = util::create_rados_callback<
+ ListSnapshotsRequest<I>,
+ &ListSnapshotsRequest<I>::handle_list_snap_orders>(this);
+ m_out_bl.clear();
+ int r = m_group_io_ctx.aio_operate(util::group_header_name(m_group_id), comp,
+ &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ListSnapshotsRequest<I>::handle_list_snap_orders(int r) {
+ auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct());
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ std::map<std::string, uint64_t> snap_orders;
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = cls_client::group_snap_list_order_finish(&iter, &snap_orders);
+ }
+
+ if (r < 0) {
+ if (r == -EOPNOTSUPP && !m_fail_if_not_sorted) {
+ list_snaps();
+ return;
+ } else {
+ lderr(cct) << "failed to get group snapshot orders: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+ }
+
+ m_snap_orders.insert(snap_orders.begin(), snap_orders.end());
+ if (snap_orders.size() < MAX_RETURN) {
+ list_snaps();
+ return;
+ }
+
+ m_start_after_order = snap_orders.rbegin()->first;
+ list_snap_orders();
+}
+
+template <typename I>
+void ListSnapshotsRequest<I>::list_snaps() {
+ auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct());
+ ldout(cct, 10) << dendl;
+
+ librados::ObjectReadOperation op;
+ cls_client::group_snap_list_start(&op, m_start_after, MAX_RETURN);
+ auto comp = util::create_rados_callback<
+ ListSnapshotsRequest<I>,
+ &ListSnapshotsRequest<I>::handle_list_snaps>(this);
+ m_out_bl.clear();
+ int r = m_group_io_ctx.aio_operate(util::group_header_name(m_group_id), comp,
+ &op, &m_out_bl);
+ ceph_assert(r == 0);
+ comp->release();
+}
+
+template <typename I>
+void ListSnapshotsRequest<I>::handle_list_snaps(int r) {
+ auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct());
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ std::vector<cls::rbd::GroupSnapshot> snaps;
+ if (r == 0) {
+ auto iter = m_out_bl.cbegin();
+ r = cls_client::group_snap_list_finish(&iter, &snaps);
+ }
+
+ if (r < 0) {
+ lderr(cct) << "failed to list group snapshots: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ m_snaps->insert(m_snaps->end(), snaps.begin(), snaps.end());
+ if (snaps.size() < MAX_RETURN) {
+ sort_snaps();
+ return;
+ }
+
+ m_start_after = *snaps.rbegin();
+ list_snaps();
+}
+
+template <typename I>
+void ListSnapshotsRequest<I>::sort_snaps() {
+ if (!m_try_to_sort) {
+ finish(0);
+ return;
+ }
+
+ auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct());
+ ldout(cct, 10) << dendl;
+
+ for (const auto& snap : *m_snaps) {
+ if (m_snap_orders.find(snap.id) == m_snap_orders.end()) {
+ ldout(cct, 10) << "Missing order for snap_id=" << snap.id << dendl;
+ finish(m_fail_if_not_sorted ? -EINVAL : 0);
+ return;
+ }
+ }
+
+ std::sort(m_snaps->begin(), m_snaps->end(),
+ [this](const cls::rbd::GroupSnapshot &a,
+ const cls::rbd::GroupSnapshot &b) {
+ return this->m_snap_orders[a.id] < this->m_snap_orders[b.id];
+ });
+
+ finish(0);
+}
+
+template <typename I>
+void ListSnapshotsRequest<I>::finish(int r) {
+ auto cct = reinterpret_cast<CephContext*>(m_group_io_ctx.cct());
+ ldout(cct, 10) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace group
+} // namespace librbd
+
+template class librbd::group::ListSnapshotsRequest<librbd::ImageCtx>;
diff --git a/src/librbd/group/ListSnapshotsRequest.h b/src/librbd/group/ListSnapshotsRequest.h
new file mode 100644
index 00000000000..7152dd981ab
--- /dev/null
+++ b/src/librbd/group/ListSnapshotsRequest.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_GROUP_LIST_SNAPSHOTS_REQUEST_H
+#define CEPH_LIBRBD_GROUP_LIST_SNAPSHOTS_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_types.h"
+
+#include <string>
+#include <vector>
+
+class Context;
+
+namespace librbd {
+
+struct ImageCtx;
+
+namespace group {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class ListSnapshotsRequest {
+public:
+ static ListSnapshotsRequest *create(
+ librados::IoCtx &group_io_ctx, const std::string &group_id,
+ bool try_to_sort, bool fail_if_not_sorted,
+ std::vector<cls::rbd::GroupSnapshot> *snaps, Context *on_finish) {
+ return new ListSnapshotsRequest(group_io_ctx, group_id, try_to_sort,
+ fail_if_not_sorted, snaps, on_finish);
+ }
+
+ ListSnapshotsRequest(librados::IoCtx &group_io_ctx,
+ const std::string &group_id,
+ bool try_to_sort, bool fail_if_not_sorted,
+ std::vector<cls::rbd::GroupSnapshot> *snaps,
+ Context *on_finish);
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start> /--------\
+ * | | | (if required. repeat if more
+ * v v | entries)
+ * LIST_SNAP_ORDERS --/
+ * | /--------\
+ * | | | (repeat if more
+ * v v | snapshots)
+ * LIST_SNAPS --------/
+ * |
+ * v
+ * SORT_SNAPS (if required)
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_group_io_ctx;
+ std::string m_group_id;
+ bool m_try_to_sort;
+ // Fail if m_try_to_sort is true and sorting fails; ignored if m_try_to_sort is false.
+ bool m_fail_if_not_sorted;
+ std::vector<cls::rbd::GroupSnapshot> *m_snaps;
+ std::map<std::string, uint64_t> m_snap_orders;
+ Context *m_on_finish;
+
+ cls::rbd::GroupSnapshot m_start_after;
+ std::string m_start_after_order;
+ bufferlist m_out_bl;
+
+ void list_snaps();
+ void handle_list_snaps(int r);
+
+ void list_snap_orders();
+ void handle_list_snap_orders(int r);
+
+ void sort_snaps();
+
+ void finish(int r);
+};
+
+} // namespace group
+} // namespace librbd
+
+extern template class librbd::group::ListSnapshotsRequest<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_GROUP_LIST_SNAPSHOTS_REQUEST_H
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index ed8ec9e9130..c389282c0cc 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -1456,7 +1456,8 @@ namespace librbd {
}
std::vector<group_snap_info2_t> snaps2;
- int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &snaps2);
+ int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, true,
+ false, &snaps2);
for (const auto& snap : snaps2) {
snaps->push_back(
@@ -1473,7 +1474,8 @@ namespace librbd {
int RBD::group_snap_list2(IoCtx& group_ioctx, const char *group_name,
std::vector<group_snap_info2_t> *snaps)
{
- return librbd::api::Group<>::snap_list(group_ioctx, group_name, snaps);
+ return librbd::api::Group<>::snap_list(group_ioctx, group_name, true,
+ false, snaps);
}
int RBD::group_snap_get_info(IoCtx& group_ioctx, const char *group_name,
@@ -7322,7 +7324,8 @@ extern "C" int rbd_group_snap_list(rados_ioctx_t group_p,
}
std::vector<librbd::group_snap_info2_t> cpp_snaps;
- int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &cpp_snaps);
+ int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, true, false,
+ &cpp_snaps);
if (r == -ENOENT) {
*snaps_size = 0;
@@ -7372,7 +7375,8 @@ extern "C" int rbd_group_snap_list2(rados_ioctx_t group_p,
librados::IoCtx::from_rados_ioctx_t(group_p, group_ioctx);
std::vector<librbd::group_snap_info2_t> cpp_snaps;
- int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, &cpp_snaps);
+ int r = librbd::api::Group<>::snap_list(group_ioctx, group_name, true, false,
+ &cpp_snaps);
if (r < 0) {
return r;
}
diff --git a/src/librbd/migration/FileStream.cc b/src/librbd/migration/FileStream.cc
index b548de7ee37..2da9e0df5e7 100644
--- a/src/librbd/migration/FileStream.cc
+++ b/src/librbd/migration/FileStream.cc
@@ -226,6 +226,18 @@ void FileStream<I>::read(io::Extents&& byte_extents, bufferlist* data,
#endif // BOOST_ASIO_HAS_POSIX_STREAM_DESCRIPTOR
+template <typename I>
+void FileStream<I>::list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) {
+ // TODO: list sparse extents based on SEEK_HOLE/SEEK_DATA
+ for (auto [byte_offset, byte_length] : byte_extents) {
+ sparse_extents->insert(byte_offset, byte_length,
+ {io::SPARSE_EXTENT_STATE_DATA, byte_length});
+ }
+ on_finish->complete(0);
+}
+
} // namespace migration
} // namespace librbd
diff --git a/src/librbd/migration/FileStream.h b/src/librbd/migration/FileStream.h
index 1a7e20ac707..e6050d865cc 100644
--- a/src/librbd/migration/FileStream.h
+++ b/src/librbd/migration/FileStream.h
@@ -44,6 +44,10 @@ public:
void read(io::Extents&& byte_extents, bufferlist* data,
Context* on_finish) override;
+ void list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) override;
+
private:
CephContext* m_cct;
std::shared_ptr<AsioEngine> m_asio_engine;
diff --git a/src/librbd/migration/HttpStream.cc b/src/librbd/migration/HttpStream.cc
index fa3cc00320e..ecf771bb53c 100644
--- a/src/librbd/migration/HttpStream.cc
+++ b/src/librbd/migration/HttpStream.cc
@@ -77,6 +77,18 @@ void HttpStream<I>::read(io::Extents&& byte_extents, bufferlist* data,
m_http_client->read(std::move(byte_extents), data, on_finish);
}
+template <typename I>
+void HttpStream<I>::list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) {
+ // no sparseness information -- list the full range as DATA
+ for (auto [byte_offset, byte_length] : byte_extents) {
+ sparse_extents->insert(byte_offset, byte_length,
+ {io::SPARSE_EXTENT_STATE_DATA, byte_length});
+ }
+ on_finish->complete(0);
+}
+
} // namespace migration
} // namespace librbd
diff --git a/src/librbd/migration/HttpStream.h b/src/librbd/migration/HttpStream.h
index 01a58371496..8606f271298 100644
--- a/src/librbd/migration/HttpStream.h
+++ b/src/librbd/migration/HttpStream.h
@@ -45,6 +45,10 @@ public:
void read(io::Extents&& byte_extents, bufferlist* data,
Context* on_finish) override;
+ void list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) override;
+
private:
using HttpResponse = boost::beast::http::response<
boost::beast::http::string_body>;
diff --git a/src/librbd/migration/NBDStream.cc b/src/librbd/migration/NBDStream.cc
new file mode 100644
index 00000000000..bf8c0c8519e
--- /dev/null
+++ b/src/librbd/migration/NBDStream.cc
@@ -0,0 +1,370 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/migration/NBDStream.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "librbd/AsioEngine.h"
+#include "librbd/ImageCtx.h"
+
+#include <libnbd.h>
+
+namespace librbd {
+namespace migration {
+
+namespace {
+
+const std::string URI_KEY{"uri"};
+
+int from_nbd_errno(int rc) {
+ // nbd_get_errno() needs a default/fallback error:
+ // "Even when a call returns an error, nbd_get_errno() might return 0.
+ // This does not mean there was no error. It means no additional errno
+ // information is available for this error."
+ return rc > 0 ? -rc : -EIO;
+}
+
+int extent_cb(void* data, const char* metacontext, uint64_t offset,
+ uint32_t* entries, size_t nr_entries, int* error) {
+ auto sparse_extents = reinterpret_cast<io::SparseExtents*>(data);
+
+ // "[...] always check the metacontext field to ensure you are
+ // receiving the data you expect."
+ if (strcmp(metacontext, LIBNBD_CONTEXT_BASE_ALLOCATION) == 0) {
+ for (size_t i = 0; i < nr_entries; i += 2) {
+ auto length = entries[i];
+ auto state = entries[i + 1];
+ if (length > 0 && state & (LIBNBD_STATE_HOLE | LIBNBD_STATE_ZERO)) {
+ sparse_extents->insert(offset, length,
+ {io::SPARSE_EXTENT_STATE_ZEROED, length});
+ }
+ offset += length;
+ }
+ }
+
+ return 1;
+}
+
+} // anonymous namespace
+
+template <typename>
+class NBDClient {
+public:
+ static NBDClient* create() {
+ return new NBDClient();
+ }
+
+ const char* get_error() {
+ return nbd_get_error();
+ }
+
+ int get_errno() {
+ return nbd_get_errno();
+ }
+
+ int init() {
+ m_handle.reset(nbd_create());
+ return m_handle != nullptr ? 0 : -1;
+ }
+
+ int add_meta_context(const char* name) {
+ return nbd_add_meta_context(m_handle.get(), name);
+ }
+
+ int connect_uri(const char* uri) {
+ return nbd_connect_uri(m_handle.get(), uri);
+ }
+
+ int64_t get_size() {
+ return nbd_get_size(m_handle.get());
+ }
+
+ int pread(void* buf, size_t count, uint64_t offset, uint32_t flags) {
+ return nbd_pread(m_handle.get(), buf, count, offset, flags);
+ }
+
+ int block_status(uint64_t count, uint64_t offset,
+ nbd_extent_callback extent_callback, uint32_t flags) {
+ return nbd_block_status(m_handle.get(), count, offset, extent_callback,
+ flags);
+ }
+
+ int shutdown(uint32_t flags) {
+ return nbd_shutdown(m_handle.get(), flags);
+ }
+
+private:
+ struct nbd_handle_deleter {
+ void operator()(nbd_handle* h) {
+ nbd_close(h);
+ }
+ };
+ std::unique_ptr<nbd_handle, nbd_handle_deleter> m_handle;
+};
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::NBDStream::ReadRequest: " \
+ << this << " " << __func__ << ": "
+
+template <typename I>
+struct NBDStream<I>::ReadRequest {
+ NBDStream* nbd_stream;
+ io::Extents byte_extents;
+ bufferlist* data;
+ Context* on_finish;
+ size_t index = 0;
+
+ ReadRequest(NBDStream* nbd_stream, io::Extents&& byte_extents,
+ bufferlist* data, Context* on_finish)
+ : nbd_stream(nbd_stream), byte_extents(std::move(byte_extents)),
+ data(data), on_finish(on_finish) {
+ auto cct = nbd_stream->m_cct;
+ ldout(cct, 20) << dendl;
+ }
+
+ void send() {
+ data->clear();
+ read();
+ }
+
+ void read() {
+ if (index >= byte_extents.size()) {
+ finish(0);
+ return;
+ }
+
+ auto cct = nbd_stream->m_cct;
+ auto [byte_offset, byte_length] = byte_extents[index++];
+ ldout(cct, 20) << "byte_offset=" << byte_offset << " byte_length="
+ << byte_length << dendl;
+
+ auto& nbd_client = nbd_stream->m_nbd_client;
+ auto ptr = buffer::ptr_node::create(buffer::create_small_page_aligned(
+ byte_length));
+ int rc = nbd_client->pread(ptr->c_str(), byte_length, byte_offset, 0);
+ if (rc == -1) {
+ rc = nbd_client->get_errno();
+ lderr(cct) << "pread " << byte_offset << "~" << byte_length << ": "
+ << nbd_client->get_error() << " (errno = " << rc << ")"
+ << dendl;
+ finish(from_nbd_errno(rc));
+ return;
+ }
+
+ data->push_back(std::move(ptr));
+ boost::asio::post(nbd_stream->m_strand, [this] { read(); });
+ }
+
+ void finish(int r) {
+ auto cct = nbd_stream->m_cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ data->clear();
+ }
+
+ on_finish->complete(r);
+ delete this;
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::NBDStream::ListSparseExtentsRequest: " \
+ << this << " " << __func__ << ": "
+
+template <typename I>
+struct NBDStream<I>::ListSparseExtentsRequest {
+ NBDStream* nbd_stream;
+ io::Extents byte_extents;
+ io::SparseExtents* sparse_extents;
+ Context* on_finish;
+ size_t index = 0;
+
+ ListSparseExtentsRequest(NBDStream* nbd_stream, io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents, Context* on_finish)
+ : nbd_stream(nbd_stream), byte_extents(std::move(byte_extents)),
+ sparse_extents(sparse_extents), on_finish(on_finish) {
+ auto cct = nbd_stream->m_cct;
+ ldout(cct, 20) << dendl;
+ }
+
+ void send() {
+ list_sparse_extents();
+ }
+
+ void list_sparse_extents() {
+ if (index >= byte_extents.size()) {
+ finish(0);
+ return;
+ }
+
+ auto cct = nbd_stream->m_cct;
+ auto [byte_offset, byte_length] = byte_extents[index++];
+ ldout(cct, 20) << "byte_offset=" << byte_offset << " byte_length="
+ << byte_length << dendl;
+
+ // nbd_block_status() is specified to be really loose:
+ // "The count parameter is a hint: the server may choose to
+ // return less status, or the final block may extend beyond the
+ // requested range. [...] It is possible for the extent function
+ // to be called more times than you expect [...] It is also
+ // possible that the extent function is not called at all, even
+ // for metadata contexts that you requested."
+ io::SparseExtents tmp_sparse_extents;
+ tmp_sparse_extents.insert(byte_offset, byte_length,
+ {io::SPARSE_EXTENT_STATE_DATA, byte_length});
+
+ auto& nbd_client = nbd_stream->m_nbd_client;
+ int rc = nbd_client->block_status(byte_length, byte_offset,
+ {extent_cb, &tmp_sparse_extents}, 0);
+ if (rc == -1) {
+ rc = nbd_client->get_errno();
+ lderr(cct) << "block_status " << byte_offset << "~" << byte_length << ": "
+ << nbd_client->get_error() << " (errno = " << rc << ")"
+ << dendl;
+ // don't propagate errors -- we are set up to list any missing
+ // parts of the range as DATA if nbd_block_status() returns less
+ // status or none at all
+ }
+
+ // trim the result in case more status was returned
+ sparse_extents->insert(tmp_sparse_extents.intersect(byte_offset,
+ byte_length));
+
+ boost::asio::post(nbd_stream->m_strand, [this] { list_sparse_extents(); });
+ }
+
+ void finish(int r) {
+ auto cct = nbd_stream->m_cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ on_finish->complete(r);
+ delete this;
+ }
+};
+
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::migration::NBDStream: " \
+ << this << " " << __func__ << ": "
+
+template <typename I>
+NBDStream<I>::NBDStream(I* image_ctx, const json_spirit::mObject& json_object)
+ : m_cct(image_ctx->cct), m_asio_engine(image_ctx->asio_engine),
+ m_json_object(json_object),
+ m_strand(boost::asio::make_strand(*m_asio_engine)) {
+}
+
+template <typename I>
+NBDStream<I>::~NBDStream() {
+}
+
+template <typename I>
+void NBDStream<I>::open(Context* on_finish) {
+ std::string uri;
+ int rc;
+
+ if (auto it = m_json_object.find(URI_KEY);
+ it != m_json_object.end()) {
+ if (it->second.type() == json_spirit::str_type) {
+ uri = it->second.get_str();
+ } else {
+ lderr(m_cct) << "invalid URI" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+ } else {
+ lderr(m_cct) << "missing URI" << dendl;
+ on_finish->complete(-EINVAL);
+ return;
+ }
+
+ ldout(m_cct, 10) << "uri=" << uri << dendl;
+
+ m_nbd_client.reset(NBDClient<I>::create());
+ rc = m_nbd_client->init();
+ if (rc == -1) {
+ rc = m_nbd_client->get_errno();
+ lderr(m_cct) << "init: " << m_nbd_client->get_error()
+ << " (errno = " << rc << ")" << dendl;
+ on_finish->complete(from_nbd_errno(rc));
+ return;
+ }
+
+ rc = m_nbd_client->add_meta_context(LIBNBD_CONTEXT_BASE_ALLOCATION);
+ if (rc == -1) {
+ rc = m_nbd_client->get_errno();
+ lderr(m_cct) << "add_meta_context: " << m_nbd_client->get_error()
+ << " (errno = " << rc << ")" << dendl;
+ on_finish->complete(from_nbd_errno(rc));
+ return;
+ }
+
+ rc = m_nbd_client->connect_uri(uri.c_str());
+ if (rc == -1) {
+ rc = m_nbd_client->get_errno();
+ lderr(m_cct) << "connect_uri: " << m_nbd_client->get_error()
+ << " (errno = " << rc << ")" << dendl;
+ on_finish->complete(from_nbd_errno(rc));
+ return;
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void NBDStream<I>::close(Context* on_finish) {
+ ldout(m_cct, 20) << dendl;
+
+ if (m_nbd_client != nullptr) {
+ // send a graceful shutdown to the server
+ // ignore errors -- we are read-only, also from the client's
+ // POV there is no disadvantage to abruptly closing the socket
+ // in nbd_close()
+ m_nbd_client->shutdown(0);
+ m_nbd_client.reset();
+ }
+
+ on_finish->complete(0);
+}
+
+template <typename I>
+void NBDStream<I>::get_size(uint64_t* size, Context* on_finish) {
+ ldout(m_cct, 20) << dendl;
+
+ int64_t rc = m_nbd_client->get_size();
+ if (rc == -1) {
+ rc = m_nbd_client->get_errno();
+ lderr(m_cct) << "get_size: " << m_nbd_client->get_error()
+ << " (errno = " << rc << ")" << dendl;
+ on_finish->complete(from_nbd_errno(rc));
+ return;
+ }
+
+ *size = rc;
+ on_finish->complete(0);
+}
+
+template <typename I>
+void NBDStream<I>::read(io::Extents&& byte_extents,
+ bufferlist* data,
+ Context* on_finish) {
+ ldout(m_cct, 20) << byte_extents << dendl;
+ auto ctx = new ReadRequest(this, std::move(byte_extents), data, on_finish);
+ boost::asio::post(m_strand, [ctx] { ctx->send(); });
+}
+
+template <typename I>
+void NBDStream<I>::list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) {
+ ldout(m_cct, 20) << byte_extents << dendl;
+ auto ctx = new ListSparseExtentsRequest(this, std::move(byte_extents),
+ sparse_extents, on_finish);
+ boost::asio::post(m_strand, [ctx] { ctx->send(); });
+}
+
+} // namespace migration
+} // namespace librbd
+
+template class librbd::migration::NBDStream<librbd::ImageCtx>;
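As a worked illustration of extent_cb() above (hypothetical values, called standalone outside the request machinery): for a base:allocation reply describing 64 KiB of data followed by a 64 KiB hole, only the hole is recorded, as ZEROED.

    // 64 KiB of data, then a 64 KiB hole starting at offset 65536.
    uint32_t entries[] = {65536, 0,
                          65536, LIBNBD_STATE_HOLE | LIBNBD_STATE_ZERO};
    io::SparseExtents se;
    int err = 0;
    extent_cb(&se, LIBNBD_CONTEXT_BASE_ALLOCATION, 0, entries, 4, &err);
    // se now holds a single extent: 65536~65536 SPARSE_EXTENT_STATE_ZEROED.
    // In ListSparseExtentsRequest the map is pre-populated with DATA for
    // the whole range, so anything the callback skips remains DATA.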
diff --git a/src/librbd/migration/NBDStream.h b/src/librbd/migration/NBDStream.h
new file mode 100644
index 00000000000..aeced5d4f3d
--- /dev/null
+++ b/src/librbd/migration/NBDStream.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_MIGRATION_NBD_STREAM_H
+#define CEPH_LIBRBD_MIGRATION_NBD_STREAM_H
+
+#include "include/int_types.h"
+#include "librbd/migration/StreamInterface.h"
+#include <json_spirit/json_spirit.h>
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/strand.hpp>
+
+struct Context;
+
+namespace librbd {
+
+struct AsioEngine;
+struct ImageCtx;
+
+namespace migration {
+
+template <typename> class NBDClient;
+
+template <typename ImageCtxT>
+class NBDStream : public StreamInterface {
+public:
+ static NBDStream* create(ImageCtxT* image_ctx,
+ const json_spirit::mObject& json_object) {
+ return new NBDStream(image_ctx, json_object);
+ }
+
+ NBDStream(ImageCtxT* image_ctx, const json_spirit::mObject& json_object);
+ ~NBDStream() override;
+
+ NBDStream(const NBDStream&) = delete;
+ NBDStream& operator=(const NBDStream&) = delete;
+
+ void open(Context* on_finish) override;
+ void close(Context* on_finish) override;
+
+ void get_size(uint64_t* size, Context* on_finish) override;
+
+ void read(io::Extents&& byte_extents, bufferlist* data,
+ Context* on_finish) override;
+
+ void list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) override;
+
+private:
+ CephContext* m_cct;
+ std::shared_ptr<AsioEngine> m_asio_engine;
+ json_spirit::mObject m_json_object;
+ boost::asio::strand<boost::asio::io_context::executor_type> m_strand;
+
+ std::unique_ptr<NBDClient<ImageCtxT>> m_nbd_client;
+
+ struct ReadRequest;
+ struct ListSparseExtentsRequest;
+};
+
+} // namespace migration
+} // namespace librbd
+
+extern template class librbd::migration::NBDStream<librbd::ImageCtx>;
+
+#endif // CEPH_LIBRBD_MIGRATION_NBD_STREAM_H
diff --git a/src/librbd/migration/RawSnapshot.cc b/src/librbd/migration/RawSnapshot.cc
index f56d518c3e1..ce280f8f16e 100644
--- a/src/librbd/migration/RawSnapshot.cc
+++ b/src/librbd/migration/RawSnapshot.cc
@@ -205,13 +205,9 @@ void RawSnapshot<I>::list_snap(io::Extents&& image_extents,
auto cct = m_image_ctx->cct;
ldout(cct, 20) << "image_extents=" << image_extents << dendl;
- // raw does support sparse extents so list the full IO extent as a delta
- for (auto& [image_offset, image_length] : image_extents) {
- sparse_extents->insert(image_offset, image_length,
- {io::SPARSE_EXTENT_STATE_DATA, image_length});
- }
-
- on_finish->complete(0);
+ // raw directly maps the image-extent IO down to a byte IO extent
+ m_stream->list_sparse_extents(std::move(image_extents), sparse_extents,
+ on_finish);
}
} // namespace migration
diff --git a/src/librbd/migration/S3Stream.cc b/src/librbd/migration/S3Stream.cc
index a611e274ae1..b53e821991a 100644
--- a/src/librbd/migration/S3Stream.cc
+++ b/src/librbd/migration/S3Stream.cc
@@ -194,6 +194,18 @@ void S3Stream<I>::process_request(HttpRequest& http_request) {
<< "authorization=" << authorization << dendl;
}
+template <typename I>
+void S3Stream<I>::list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) {
+ // no sparseness information -- list the full range as DATA
+ for (auto [byte_offset, byte_length] : byte_extents) {
+ sparse_extents->insert(byte_offset, byte_length,
+ {io::SPARSE_EXTENT_STATE_DATA, byte_length});
+ }
+ on_finish->complete(0);
+}
+
} // namespace migration
} // namespace librbd
diff --git a/src/librbd/migration/S3Stream.h b/src/librbd/migration/S3Stream.h
index 586b217878c..1c2927ad1e1 100644
--- a/src/librbd/migration/S3Stream.h
+++ b/src/librbd/migration/S3Stream.h
@@ -46,6 +46,10 @@ public:
void read(io::Extents&& byte_extents, bufferlist* data,
Context* on_finish) override;
+ void list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) override;
+
private:
using HttpRequest = boost::beast::http::request<
boost::beast::http::empty_body>;
diff --git a/src/librbd/migration/SourceSpecBuilder.cc b/src/librbd/migration/SourceSpecBuilder.cc
index f44d97d58f1..78937db99b3 100644
--- a/src/librbd/migration/SourceSpecBuilder.cc
+++ b/src/librbd/migration/SourceSpecBuilder.cc
@@ -7,6 +7,9 @@
#include "librbd/migration/FileStream.h"
#include "librbd/migration/HttpStream.h"
#include "librbd/migration/S3Stream.h"
+#if defined(HAVE_LIBNBD)
+#include "librbd/migration/NBDStream.h"
+#endif
#include "librbd/migration/NativeFormat.h"
#include "librbd/migration/QCOWFormat.h"
#include "librbd/migration/RawFormat.h"
@@ -125,6 +128,10 @@ int SourceSpecBuilder<I>::build_stream(
stream->reset(HttpStream<I>::create(m_image_ctx, stream_obj));
} else if (type == "s3") {
stream->reset(S3Stream<I>::create(m_image_ctx, stream_obj));
+#if defined(HAVE_LIBNBD)
+ } else if (type == "nbd") {
+ stream->reset(NBDStream<I>::create(m_image_ctx, stream_obj));
+#endif
} else {
lderr(cct) << "unknown or unsupported stream type '" << type << "'"
<< dendl;
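With the stream type registered here, an import-only migration source-spec can point at an NBD export by using a stream object of the form {"type": "nbd", "uri": "nbd://host[:port]/export"} (illustrative: NBDStream::open() only consumes the "uri" key, and the accepted URI syntax is whatever nbd_connect_uri() supports).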
diff --git a/src/librbd/migration/StreamInterface.h b/src/librbd/migration/StreamInterface.h
index 782a9a5f8d5..52ded94ccad 100644
--- a/src/librbd/migration/StreamInterface.h
+++ b/src/librbd/migration/StreamInterface.h
@@ -24,6 +24,10 @@ struct StreamInterface {
virtual void read(io::Extents&& byte_extents, bufferlist* data,
Context* on_finish) = 0;
+
+ virtual void list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) = 0;
};
} // namespace migration
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 89e2e5e4be9..f000da7928a 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -3461,16 +3461,23 @@ bool CDir::can_auth_pin(int *err_ret) const
{
int err;
if (!is_auth()) {
+ dout(20) << __func__ << ": error - no auth" << dendl;
err = ERR_NOT_AUTH;
} else if (is_freezing_dir() || is_frozen_dir()) {
+ dout(20) << __func__ << ": error - fragmenting dir ("
+ << (is_freezing_dir() ? "freezing" : "frozen")
+ << ")" << dendl;
err = ERR_FRAGMENTING_DIR;
} else {
auto p = is_freezing_or_frozen_tree();
if (p.first) {
+ dout(20) << __func__ << ": error - exporting tree" << dendl;
err = ERR_EXPORTING_TREE;
} else if (p.second) {
+ dout(20) << __func__ << ": error - exporting tree" << dendl;
err = ERR_EXPORTING_TREE;
} else {
+ dout(20) << __func__ << ": auth!" << dendl;
err = 0;
}
}
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 46aff12673e..faf9f408688 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -2987,12 +2987,15 @@ void CInode::clear_ambiguous_auth()
bool CInode::can_auth_pin(int *err_ret) const {
int err;
if (!is_auth()) {
+ dout(20) << __func__ << ": error - no auth" << dendl;
err = ERR_NOT_AUTH;
} else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
+ dout(20) << __func__ << ": error - exporting inode" << dendl;
err = ERR_EXPORTING_INODE;
} else {
if (parent)
return parent->can_auth_pin(err_ret);
+ dout(20) << __func__ << ": auth!" << dendl;
err = 0;
}
if (err && err_ret)
@@ -5281,6 +5284,7 @@ void CInode::scrub_maybe_delete_info()
{
if (scrub_infop &&
!scrub_infop->scrub_in_progress &&
+ !scrub_infop->uninline_in_progress &&
!scrub_infop->last_scrub_dirty) {
scrub_infop.reset();
}
@@ -5292,30 +5296,52 @@ void CInode::scrub_initialize(ScrubHeaderRef& header)
scrub_info();
scrub_infop->scrub_in_progress = true;
+ scrub_infop->uninline_in_progress = false;
scrub_infop->queued_frags.clear();
scrub_infop->header = header;
header->inc_num_pending();
// right now we don't handle remote inodes
}
+void CInode::uninline_initialize()
+{
+ dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
+ scrub_infop->uninline_in_progress = true;
+}
+
void CInode::scrub_aborted() {
dout(20) << __func__ << dendl;
ceph_assert(scrub_is_in_progress());
scrub_infop->scrub_in_progress = false;
+ scrub_infop->uninline_in_progress = false;
scrub_infop->header->dec_num_pending();
scrub_maybe_delete_info();
}
+void CInode::common_finished() {
+ if (!scrub_is_in_progress()) {
+ scrub_infop->last_scrub_version = get_version();
+ scrub_infop->last_scrub_stamp = ceph_clock_now();
+ scrub_infop->last_scrub_dirty = true;
+ scrub_infop->header->dec_num_pending();
+ }
+}
+
void CInode::scrub_finished() {
dout(20) << __func__ << dendl;
ceph_assert(scrub_is_in_progress());
- scrub_infop->last_scrub_version = get_version();
- scrub_infop->last_scrub_stamp = ceph_clock_now();
- scrub_infop->last_scrub_dirty = true;
scrub_infop->scrub_in_progress = false;
- scrub_infop->header->dec_num_pending();
+ common_finished();
+}
+
+void CInode::uninline_finished() {
+ dout(20) << __func__ << dendl;
+ ceph_assert(scrub_is_in_progress());
+
+ scrub_infop->uninline_in_progress = false;
+ common_finished();
}
int64_t CInode::get_backtrace_pool() const
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index cf2322998e3..d55b6442107 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -305,6 +305,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
bool last_scrub_dirty = false; /// are our stamps dirty with respect to disk state?
bool scrub_in_progress = false; /// are we currently scrubbing?
+ bool uninline_in_progress = false; /// are we currently uninlining?
fragset_t queued_frags;
@@ -437,7 +438,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
}
bool scrub_is_in_progress() const {
- return (scrub_infop && scrub_infop->scrub_in_progress);
+ return (scrub_infop && (scrub_infop->scrub_in_progress || scrub_infop->uninline_in_progress));
}
/**
* Start scrubbing on this inode. That could be very short if it's
@@ -448,6 +449,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
* directory's get_projected_version())
*/
void scrub_initialize(ScrubHeaderRef& header);
+ void uninline_initialize();
/**
* Call this once the scrub has been completed, whether it's a full
* recursive scrub on a directory or simply the data on a file (or
@@ -456,6 +458,8 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
* be complete()ed.
*/
void scrub_finished();
+ void uninline_finished();
+ void common_finished();
void scrub_aborted();
@@ -1058,6 +1062,15 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
MDSContext *fin);
static void dump_validation_results(const validated_data& results,
ceph::Formatter *f);
+ bool has_inline_data() {
+ if (is_normal() && is_file()) {
+ auto pin = get_projected_inode();
+ if (pin->inline_data.version != CEPH_INLINE_NONE) {
+ return true;
+ }
+ }
+ return false;
+ }
//bool hack_accessed = false;
//utime_t hack_load_stamp;
diff --git a/src/mds/DamageTable.cc b/src/mds/DamageTable.cc
index 2079d23333a..4b9cba199da 100644
--- a/src/mds/DamageTable.cc
+++ b/src/mds/DamageTable.cc
@@ -123,6 +123,41 @@ class BacktraceDamage : public DamageEntry
f->close_section();
}
};
+
+/**
+ * Record about Uninline failures during scrub
+ */
+class UninlineDamage : public DamageEntry
+{
+ public:
+ inodeno_t ino;
+ mds_rank_t rank;
+ int32_t failure_errno;
+ std::string scrub_tag;
+
+ UninlineDamage(
+ inodeno_t ino_, mds_rank_t rank_, int32_t errno_, std::string_view scrub_tag_)
+ : ino(ino_), rank(rank_), failure_errno(errno_), scrub_tag(scrub_tag_)
+ {}
+
+ damage_entry_type_t get_type() const override
+ {
+ return DAMAGE_ENTRY_UNINLINE_FILE;
+ }
+
+ void dump(Formatter *f) const override
+ {
+ f->open_object_section("uninline_damage");
+ f->dump_string("damage_type", "uninline");
+ f->dump_int("id", id);
+ f->dump_int("ino", ino);
+ f->dump_int("rank", rank);
+ f->dump_string("errno", cpp_strerror(failure_errno));
+ f->dump_string("scrub_tag", scrub_tag);
+ f->dump_string("path", path);
+ f->close_section();
+ }
+};
}
DamageEntry::~DamageEntry()
@@ -228,6 +263,27 @@ void DamageTable::remove_backtrace_damage_entry(inodeno_t ino)
}
}
+bool DamageTable::notify_uninline_failed(
+ inodeno_t ino,
+ mds_rank_t rank,
+ int32_t failure_errno,
+ std::string_view scrub_tag,
+ std::string_view path)
+{
+ if (oversized()) {
+ return true;
+ }
+
+ if (auto [it, inserted] = uninline_failures.try_emplace(ino); inserted) {
+ auto entry = std::make_shared<UninlineDamage>(ino, rank, failure_errno, scrub_tag);
+ entry->path = path;
+ it->second = entry;
+ by_id[entry->id] = std::move(entry);
+ }
+
+ return false;
+}
+
bool DamageTable::oversized() const
{
return by_id.size() > (size_t)(g_conf()->mds_damage_table_max_entries);
@@ -293,6 +349,9 @@ void DamageTable::erase(damage_entry_id_t damage_id)
} else if (type == DAMAGE_ENTRY_BACKTRACE) {
auto backtrace_entry = std::static_pointer_cast<BacktraceDamage>(entry);
remotes.erase(backtrace_entry->ino);
+ } else if (type == DAMAGE_ENTRY_UNINLINE_FILE) {
+ auto uninline_entry = std::static_pointer_cast<UninlineDamage>(entry);
+ uninline_failures.erase(uninline_entry->ino);
} else {
derr << "Invalid type " << type << dendl;
ceph_abort();
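For reference, UninlineDamage::dump() above produces an entry shaped roughly as follows (all values invented for illustration):

    {
        "damage_type": "uninline",
        "id": 3816673998,
        "ino": 1099511627776,
        "rank": 0,
        "errno": "(5) Input/output error",
        "scrub_tag": "nightly-scrub",
        "path": "/dir/inlined_file"
    }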
diff --git a/src/mds/DamageTable.h b/src/mds/DamageTable.h
index a1b96fe2218..a8182491a9f 100644
--- a/src/mds/DamageTable.h
+++ b/src/mds/DamageTable.h
@@ -30,7 +30,8 @@ typedef enum
{
DAMAGE_ENTRY_DIRFRAG,
DAMAGE_ENTRY_DENTRY,
- DAMAGE_ENTRY_BACKTRACE
+ DAMAGE_ENTRY_BACKTRACE,
+ DAMAGE_ENTRY_UNINLINE_FILE
} damage_entry_type_t;
@@ -162,6 +163,16 @@ class DamageTable
void remove_backtrace_damage_entry(inodeno_t ino);
+ /**
+ * Indicate that there was some error when attempting to uninline the
+ * data of a file.
+ *
+ * @return true if fatal
+ */
+ bool notify_uninline_failed(
+ inodeno_t ino, mds_rank_t rank, int32_t failure_errno,
+ std::string_view scrub_tag, std::string_view path);
+
bool is_dentry_damaged(
const CDir *dir_frag,
std::string_view dname,
@@ -194,6 +205,9 @@ class DamageTable
// (i.e. have probably/possibly missing backtraces)
std::map<inodeno_t, DamageEntryRef> remotes;
+ // Map of all inodes for which Data Uninlining failed
+ std::map<inodeno_t, DamageEntryRef> uninline_failures;
+
// All damage, by ID. This is a secondary index
// to the dirfrag, dentry, remote maps. It exists
// to enable external tools to unambiguously operate
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index b5e28140c32..c433c77b453 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -9897,6 +9897,9 @@ void MDCache::dispatch_request(const MDRequestRef& mdr)
case CEPH_MDS_OP_RDLOCK_FRAGSSTATS:
rdlock_dirfrags_stats_work(mdr);
break;
+ case CEPH_MDS_OP_UNINLINE_DATA:
+ uninline_data_work(mdr);
+ break;
default:
ceph_abort();
}
@@ -13175,6 +13178,190 @@ void MDCache::enqueue_scrub_work(const MDRequestRef& mdr)
mds->server->respond_to_request(mdr, r);
}
+class C_MDC_DataUninlinedSubmitted : public MDCacheLogContext {
+ MDRequestRef mdr;
+
+ public:
+ C_MDC_DataUninlinedSubmitted(MDRequestRef r, MDSRank *mds) :
+ MDCacheLogContext(mds->mdcache), mdr(r) {}
+
+ void finish(int r) {
+ auto mds = get_mds(); // to keep dout happy
+ auto in = mds->server->rdlock_path_pin_ref(mdr, true);
+
+ ceph_assert(in != nullptr);
+
+ dout(20) << "(uninline_data) log submission "
+ << (r ? "failed" : "succeeded")
+ << "; r=" << r
+ << " (" << cpp_strerror(r) << ") for " << *in << dendl;
+
+ // journaling must not fail
+ ceph_assert(r == 0);
+
+ in->mdcache->logger->inc(l_mdc_uninline_succeeded);
+ auto h = in->get_scrub_header();
+ h->record_uninline_passed();
+ in->uninline_finished();
+ mdr->apply();
+ mds->server->respond_to_request(mdr, r);
+ }
+};
+
+struct C_IO_DataUninlined : public MDSIOContext {
+ MDRequestRef mdr;
+
+ public:
+ C_IO_DataUninlined(MDRequestRef r, MDSRank *mds) : MDSIOContext(mds), mdr(r) {}
+
+ virtual void print(std::ostream& os) const {
+ os << "data uninlined";
+ }
+
+ void finish(int r) override {
+ auto mds = get_mds(); // to keep dout/derr happy
+ auto in = mds->server->rdlock_path_pin_ref(mdr, true);
+
+ // return faster if operation has failed (non-zero) status
+ if (r) {
+ derr << "(uninline_data) mutation failed: r=" << r
+ << " (" << cpp_strerror(r) << ") for " << *in << dendl;
+ in->mdcache->logger->inc(l_mdc_uninline_write_failed);
+ ceph_assert(in->get_scrub_header());
+ auto h = in->get_scrub_header();
+ h->record_uninline_failed();
+ std::string path;
+ in->make_path_string(path);
+ h->record_uninline_status(in->ino(), r, path);
+ in->uninline_finished();
+ mds->server->respond_to_request(mdr, r);
+ return;
+ }
+
+ dout(20) << "(uninline_data) mutation succeeded for " << *in << dendl;
+
+ // journal the inode changes
+ MDLog *mdlog = mds->mdlog;
+
+ dout(20) << "(uninline_data) writing to journal for " << *in << dendl;
+
+ EUpdate *le = new EUpdate(mdlog, "uninline");
+ mdr->ls = mdlog->get_current_segment();
+
+ auto pi = in->project_inode(mdr);
+ pi.inode->version = in->pre_dirty();
+ pi.inode->inline_data.free_data();
+ pi.inode->inline_data.version = CEPH_INLINE_NONE;
+ pi.inode->ctime = mdr->get_op_stamp();
+ if (mdr->get_op_stamp() > pi.inode->rstat.rctime) {
+ pi.inode->rstat.rctime = mdr->get_op_stamp();
+ }
+ pi.inode->change_attr++;
+
+ in->mdcache->predirty_journal_parents(mdr, &le->metablob, in, nullptr,
+ PREDIRTY_PRIMARY);
+ in->mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
+
+ mdr->committing = true;
+
+ string event_str("submit entry: ");
+ event_str += __func__;
+ mdr->mark_event(event_str);
+
+ auto fin = new C_MDC_DataUninlinedSubmitted(mdr, mds);
+ mdlog->submit_entry(le, fin);
+ }
+};
+
+void MDCache::uninline_data_work(MDRequestRef mdr)
+{
+ CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
+
+ if (!in) {
+ return;
+ }
+
+ MutationImpl::LockOpVec lov;
+ lov.add_xlock(&in->authlock);
+ lov.add_xlock(&in->filelock);
+ lov.add_xlock(&in->versionlock);
+
+ if (!mds->locker->acquire_locks(mdr, lov)) {
+ dout(20) << "(uninline_data) acquire_locks failed; will retry later for " << *in << dendl;
+ return; // lock not available immediately
+ }
+
+ if (!in->has_inline_data()) {
+ dout(20) << "(uninline_data) inode doesn't have inline data anymore " << *in << dendl;
+ in->uninline_finished();
+ mds->server->respond_to_request(mdr, 0);
+ return;
+ }
+ if (MDS_INO_IS_MDSDIR(in->get_scrub_header()->get_origin())) {
+ in->get_scrub_header()->record_uninline_skipped();
+ mds->server->respond_to_request(mdr, 0);
+ return;
+ }
+
+ logger->inc(l_mdc_uninline_started);
+ auto h = in->get_scrub_header();
+ h->record_uninline_started();
+ in->uninline_initialize();
+
+ auto ino = [&]() { return in->ino(); };
+ auto pi = in->get_projected_inode();
+ auto objecter = mds->objecter;
+
+ dout(20) << "(uninline_data) testing inline_data.version for " << *in << dendl;
+ ceph_assert(objecter);
+ ceph_assert(pi->inline_data.version != CEPH_INLINE_NONE);
+
+ object_t oid = InodeStoreBase::get_object_name(ino(), frag_t(), "");
+ SnapContext snapc;
+ SnapRealm *snaprealm = in->find_snaprealm();
+ auto& snapc_ref = (snaprealm ? snaprealm->get_snap_context() : snapc);
+
+ ObjectOperation create_ops;
+ create_ops.create(false);
+
+ dout(20) << "(uninline_data) dispatching objecter to create \""
+ << mdr->get_filepath() << "\" for " << *in << dendl;
+
+ objecter->mutate(oid,
+ OSDMap::file_to_object_locator(pi->layout),
+ create_ops,
+ snapc_ref,
+ ceph::real_clock::now(),
+ 0,
+ nullptr);
+
+ bufferlist inline_version_bl;
+
+ in->encode(inline_version_bl, pi->inline_data.version);
+
+ ObjectOperation uninline_ops;
+ uninline_ops.cmpxattr("inline_version",
+ CEPH_OSD_CMPXATTR_OP_GT,
+ CEPH_OSD_CMPXATTR_MODE_U64,
+ inline_version_bl);
+
+ if (pi->inline_data.length() > 0) {
+ dout(10) << "(uninline_data) moving inline data for \"" << mdr->get_filepath() << "\" to file for " << *in << dendl;
+ bufferlist inline_data;
+ pi->inline_data.get_data(inline_data);
+ uninline_ops.write(0, inline_data, pi->truncate_size, pi->truncate_seq);
+ }
+ uninline_ops.setxattr("inline_version", std::to_string(CEPH_INLINE_NONE));
+
+ objecter->mutate(oid,
+ OSDMap::file_to_object_locator(pi->layout),
+ uninline_ops,
+ snapc_ref,
+ ceph::real_clock::now(),
+ 0,
+ new C_IO_DataUninlined(mdr, mds));
+}
+
struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
MDRequestRef mdr;
C_MDC_RespondInternalRequest(MDCache *c, const MDRequestRef& m) :
@@ -13548,6 +13735,14 @@ void MDCache::register_perfcounters()
pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
"Internal Request type inode stats");
+ // uninline op stats
+ pcb.add_u64_counter(l_mdc_uninline_started, "uninline_started",
+ "Internal Counter type uninline started");
+ pcb.add_u64_counter(l_mdc_uninline_succeeded, "uninline_succeeded",
+ "Internal Counter type uninline succeeded");
+ pcb.add_u64_counter(l_mdc_uninline_write_failed, "uninline_write_failed",
+ "Internal Counter type uninline write failed");
+
logger.reset(pcb.create_perf_counters());
g_ceph_context->get_perfcounters_collection()->add(logger.get());
recovery_queue.set_logger(logger.get());
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 8ddadcd716a..0b01c9ab859 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -118,6 +118,10 @@ enum {
l_mdss_ireq_fragstats,
l_mdss_ireq_inodestats,
+ l_mdc_uninline_started,
+ l_mdc_uninline_succeeded,
+ l_mdc_uninline_write_failed,
+
l_mdc_last,
};
@@ -1094,6 +1098,8 @@ private:
void repair_dirfrag_stats(CDir *dir);
void rdlock_dirfrags_stats(CInode *diri, MDSInternalContext *fin);
+ void uninline_data_work(MDRequestRef mdr);
+
// my leader
MDSRank *mds;
@@ -1442,6 +1448,8 @@ private:
friend class C_MDC_FragmentCommit;
friend class C_MDC_FragmentRollback;
friend class C_IO_MDC_FragmentPurgeOld;
+ friend class C_IO_DataUninlined;
+ friend class C_MDC_DataUninlinedSubmitted;
// -- subtrees --
static const unsigned int SUBTREES_COUNT_THRESHOLD = 5;
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
index b31d9c95220..75b608ace77 100644
--- a/src/mds/MDSDaemon.cc
+++ b/src/mds/MDSDaemon.cc
@@ -334,6 +334,11 @@ void MDSDaemon::set_up_admin_socket()
asok_hook,
"Status of scrub operations(s)");
ceph_assert(r == 0);
+ r = admin_socket->register_command("scrub purge_status "
+ "name=tag,type=CephString,req=true",
+ asok_hook,
+ "Purge status of scrub tag|all");
+ ceph_assert(r == 0);
r = admin_socket->register_command("tag path name=path,type=CephString"
" name=tag,type=CephString",
asok_hook,
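Assuming the usual admin-socket/tell plumbing, the new command would be invoked along these lines (illustrative; rank 0 only, per the MDSRank handler later in this patch):

    # purge the recorded scrub counters for a single tag ...
    ceph tell mds.<fsname>:0 scrub purge_status mytag
    # ... or for all tags
    ceph tell mds.<fsname>:0 scrub purge_status all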
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index 1cd742423e6..50c7175df80 100644
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -782,8 +782,10 @@ void MDSRankDispatcher::tick()
}
}
- if (whoami == 0)
+ if (whoami == 0) {
scrubstack->advance_scrub_status();
+ scrubstack->purge_old_scrub_counters();
+ }
}
if (is_active() || is_stopping()) {
@@ -2965,6 +2967,15 @@ void MDSRankDispatcher::handle_asok_command(
command_scrub_resume(f);
} else if (command == "scrub status") {
command_scrub_status(f);
+ } else if (command == "scrub purge_status") {
+ if (whoami != 0) {
+ *css << "Not rank 0";
+ r = -CEPHFS_EXDEV;
+ goto out;
+ }
+ string tag;
+ cmd_getval(cmdmap, "tag", tag);
+ command_scrub_purge_status(tag);
} else if (command == "tag path") {
if (whoami != 0) {
*css << "Not rank 0";
@@ -3225,6 +3236,11 @@ void MDSRank::command_scrub_status(Formatter *f) {
scrubstack->scrub_status(f);
}
+void MDSRank::command_scrub_purge_status(std::string_view tag) {
+ std::lock_guard l(mds_lock);
+ scrubstack->purge_scrub_counters(tag);
+}
+
void MDSRank::command_get_subtrees(Formatter *f)
{
ceph_assert(f != NULL);
@@ -4233,6 +4249,7 @@ void MDSRankDispatcher::handle_conf_change(const ConfigProxy& conf, const std::s
mdcache->handle_conf_change(changed, *mdsmap);
mdlog->handle_conf_change(changed, *mdsmap);
purge_queue.handle_conf_change(changed, *mdsmap);
+ scrubstack->handle_conf_change(changed);
}));
}
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h
index c4a8809b6e1..9135db40c09 100644
--- a/src/mds/MDSRank.h
+++ b/src/mds/MDSRank.h
@@ -509,6 +509,7 @@ class MDSRank {
// scrub control commands
void command_scrub_resume(Formatter *f);
void command_scrub_status(Formatter *f);
+ void command_scrub_purge_status(std::string_view tag);
void command_get_subtrees(Formatter *f);
void command_export_dir(Formatter *f,
diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h
index a5d35f61ce4..5f112387fec 100644
--- a/src/mds/ScrubHeader.h
+++ b/src/mds/ScrubHeader.h
@@ -27,6 +27,7 @@ class Formatter;
};
class CInode;
+class MDCache;
/**
* Externally input parameters for a scrub, associated with the root
@@ -64,6 +65,48 @@ public:
}
unsigned get_num_pending() const { return num_pending; }
+ void record_uninline_status(_inodeno_t ino, int e, std::string_view path) {
+ if (uninline_failed_info.find(e) == uninline_failed_info.end()) {
+ uninline_failed_info[e] = std::vector<_inodeno_t>();
+ }
+ auto& v = uninline_failed_info.at(e);
+ v.push_back(ino);
+ paths[ino] = path;
+ }
+
+ std::unordered_map<int, std::vector<_inodeno_t>>& get_uninline_failed_info() {
+ return uninline_failed_info;
+ }
+
+ std::unordered_map<_inodeno_t, std::string>& get_paths() {
+ return paths;
+ }
+
+ void record_uninline_started() {
+ uninline_started++;
+ }
+ void record_uninline_passed() {
+ uninline_passed++;
+ }
+ void record_uninline_failed() {
+ uninline_failed++;
+ }
+ void record_uninline_skipped() {
+ uninline_skipped++;
+ }
+ uint64_t get_uninline_started() const {
+ return uninline_started;
+ }
+ uint64_t get_uninline_passed() const {
+ return uninline_passed;
+ }
+ uint64_t get_uninline_failed() const {
+ return uninline_failed;
+ }
+ uint64_t get_uninline_skipped() const {
+ return uninline_skipped;
+ }
+
protected:
const std::string tag;
bool is_tag_internal;
@@ -76,6 +119,15 @@ protected:
bool repaired = false; // May be set during scrub if repairs happened
unsigned epoch_last_forwarded = 0;
unsigned num_pending = 0;
+ // errno -> [ino1, ino2, ino3, ...]
+ std::unordered_map<int, std::vector<_inodeno_t>> uninline_failed_info;
+ std::unordered_map<_inodeno_t, std::string> paths;
+
+ // scrub counters
+ uint64_t uninline_started = 0;
+ uint64_t uninline_passed = 0;
+ uint64_t uninline_failed = 0;
+ uint64_t uninline_skipped = 0;
};
typedef std::shared_ptr<ScrubHeader> ScrubHeaderRef;
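A small sketch of how the bookkeeping above groups uninline failures (hypothetical inode numbers and paths, given some ScrubHeaderRef h):

    h->record_uninline_status(0x10000000001, -EIO, "/dir/a");
    h->record_uninline_status(0x10000000002, -EIO, "/dir/b");
    // get_uninline_failed_info()[-EIO] == {0x10000000001, 0x10000000002}
    // get_paths()[0x10000000001] == "/dir/a"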
diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc
index 28392f53366..7ec77a31de3 100644
--- a/src/mds/ScrubStack.cc
+++ b/src/mds/ScrubStack.cc
@@ -17,6 +17,7 @@
#include "mds/MDSRank.h"
#include "mds/MDCache.h"
#include "mds/MDSContinuation.h"
+#include "osdc/Objecter.h"
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
@@ -75,6 +76,7 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top)
dout(10) << __func__ << " with {" << *in << "}" << ", top=" << top << dendl;
in->scrub_initialize(header);
+ in->uninline_initialize();
} else if (CDir *dir = dynamic_cast<CDir*>(obj)) {
if (dir->scrub_is_in_progress()) {
dout(10) << __func__ << " with {" << *dir << "}" << ", already in scrubbing" << dendl;
@@ -106,6 +108,55 @@ int ScrubStack::_enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top)
return 0;
}
+void ScrubStack::purge_scrub_counters(std::string_view tag)
+{
+ for (auto& stat : mds_scrub_stats) {
+ if (tag == "all") {
+ stat.counters.clear();
+ } else {
+ auto it = stat.counters.find(std::string(tag));
+ if (it != stat.counters.end()) {
+ stat.counters.erase(it);
+ }
+ }
+ }
+}
+
+// called from tick
+void ScrubStack::purge_old_scrub_counters()
+{
+ // "mds_scrub_stats_review_period" must be in number of days
+ auto review_period = ceph::make_timespan(_mds_scrub_stats_review_period * 24 * 60 * 60);
+ auto now = coarse_real_clock::now();
+
+ dout(20) << __func__ << " review_period:" << review_period << dendl;
+
+ for (mds_rank_t rank = 0; rank < (mds_rank_t)mds_scrub_stats.size(); rank++) {
+ auto& counters = mds_scrub_stats[rank].counters;
+ for (auto it = counters.begin(); it != counters.end(); ) {
+ auto curr = it;
+ auto c = (*it).second;
+ auto elapsed = now - c.start_time;
+ dout(20) << __func__
+ << " rank(" << rank << ") :"
+ << " elapsed:" << elapsed
+ << dendl;
+ ++it;
+ if (elapsed >= review_period) {
+ counters.erase(curr);
+ }
+ }
+ }
+}
+
+void ScrubStack::init_scrub_counters(std::string_view path, std::string_view tag)
+{
+ scrub_counters_t sc{coarse_real_clock::now(), std::string(path), 0, 0, 0};
+ for (auto& stat : mds_scrub_stats) {
+ stat.counters[std::string(tag)] = sc;
+ }
+}
+
int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top)
{
// abort in progress
@@ -133,6 +184,10 @@ int ScrubStack::enqueue(CInode *in, ScrubHeaderRef& header, bool top)
//to make sure mdsdir is always on the top
top = false;
}
+
+ std::string path;
+ in->make_path_string(path);
+ init_scrub_counters(path, header->get_tag());
int r = _enqueue(in, header, top);
if (r < 0)
return r;
@@ -227,6 +282,7 @@ void ScrubStack::kick_off_scrubs()
// it's a regular file, symlink, or hard link
dequeue(in); // we only touch it this once, so remove from stack
+ uninline_data(in, new C_MDSInternalNoop);
scrub_file_inode(in);
} else {
bool added_children = false;
@@ -235,6 +291,7 @@ void ScrubStack::kick_off_scrubs()
if (done) {
dout(20) << __func__ << " dir inode, done" << dendl;
dequeue(in);
+ in->uninline_finished();
}
if (added_children) {
// dirfrags were queued at top of stack
@@ -707,6 +764,43 @@ void ScrubStack::scrub_status(Formatter *f) {
f->close_section(); // scrub id
}
f->close_section(); // scrubs
+
+ if (mds_scrub_stats.size()) {
+ f->open_object_section("scrub_stats");
+ for (auto& [tag, ctrs] : mds_scrub_stats[0].counters) {
+ uint64_t started = 0;
+ uint64_t passed = 0;
+ uint64_t failed = 0;
+ uint64_t skipped = 0;
+ for (auto& stats : mds_scrub_stats) {
+ if (auto it = stats.counters.find(tag); it != stats.counters.end()) {
+ auto& [t, c] = *it;
+ started += c.uninline_started;
+ passed += c.uninline_passed;
+ failed += c.uninline_failed;
+ skipped += c.uninline_skipped;
+ }
+ }
+ f->open_object_section(tag);
+ {
+ f->dump_stream("start_time") << ctrs.start_time;
+ std::string path = ctrs.origin_path;
+ if (path == "") {
+ path = "/";
+ } else if (path.starts_with("~mds")) {
+ path = "~mdsdir";
+ }
+ f->dump_string("path", path);
+ f->dump_int("uninline_started", started);
+ f->dump_int("uninline_passed", passed);
+ f->dump_int("uninline_failed", failed);
+ f->dump_int("uninline_skipped", skipped);
+ }
+ f->close_section(); // tag
+ }
+ f->close_section(); // scrub_stats
+ }
+
f->close_section(); // result
}
@@ -936,6 +1030,7 @@ void ScrubStack::handle_scrub(const cref_t<MMDSScrub> &m)
header->set_origin(m->get_origin());
scrubbing_map.emplace(header->get_tag(), header);
}
+
for (auto dir : dfs) {
queued.insert_raw(dir->get_frag());
_enqueue(dir, header, true);
@@ -1016,6 +1111,7 @@ void ScrubStack::handle_scrub(const cref_t<MMDSScrub> &m)
const auto& header = in->get_scrub_header();
header->set_epoch_last_forwarded(scrub_epoch);
in->scrub_finished();
+ in->uninline_finished();
kick_off_scrubs();
}
@@ -1052,6 +1148,10 @@ void ScrubStack::handle_scrub_stats(const cref_t<MMDSScrubStats> &m)
bool any_finished = false;
bool any_repaired = false;
std::set<std::string> scrubbing_tags;
+ std::unordered_map<std::string, unordered_map<int, std::vector<_inodeno_t>>> uninline_failed_meta_info;
+ std::unordered_map<_inodeno_t, std::string> paths;
+ std::unordered_map<std::string, std::vector<uint64_t>> counters;
+
for (auto it = scrubbing_map.begin(); it != scrubbing_map.end(); ) {
auto& header = it->second;
if (header->get_num_pending() ||
@@ -1062,6 +1162,17 @@ void ScrubStack::handle_scrub_stats(const cref_t<MMDSScrubStats> &m)
any_finished = true;
if (header->get_repaired())
any_repaired = true;
+ auto& ufi = header->get_uninline_failed_info();
+ uninline_failed_meta_info[it->first] = ufi;
+ ufi.clear();
+ paths.merge(header->get_paths());
+ ceph_assert(header->get_paths().size() == 0);
+ std::vector<uint64_t> c{header->get_uninline_started(),
+ header->get_uninline_passed(),
+ header->get_uninline_failed(),
+ header->get_uninline_skipped()
+ };
+ counters[header->get_tag()] = c;
scrubbing_map.erase(it++);
} else {
++it;
@@ -1071,7 +1182,11 @@ void ScrubStack::handle_scrub_stats(const cref_t<MMDSScrubStats> &m)
scrub_epoch = m->get_epoch();
auto ack = make_message<MMDSScrubStats>(scrub_epoch,
- std::move(scrubbing_tags), clear_stack);
+ std::move(scrubbing_tags),
+ std::move(uninline_failed_meta_info),
+ std::move(paths),
+ std::move(counters),
+ clear_stack);
mdcache->mds->send_message_mds(ack, 0);
if (any_finished)
@@ -1085,7 +1200,40 @@ void ScrubStack::handle_scrub_stats(const cref_t<MMDSScrubStats> &m)
stat.epoch_acked = m->get_epoch();
stat.scrubbing_tags = m->get_scrubbing_tags();
stat.aborting = m->is_aborting();
+ for (auto& [scrub_tag, errno_map] : m->get_uninline_failed_meta_info()) {
+ stat.uninline_failed_meta_info[scrub_tag] = errno_map;
+ }
+ stat.paths.insert(m->get_paths().begin(), m->get_paths().end());
+ for (auto& [tag, v] : m->get_counters()) {
+ stat.counters[tag].uninline_started = v[0];
+ stat.counters[tag].uninline_passed = v[1];
+ stat.counters[tag].uninline_failed = v[2];
+ stat.counters[tag].uninline_skipped = v[3];
+ }
+ }
+ }
+}
+
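+// record uninline failures reported by all ranks in the MDS damage table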
+void ScrubStack::move_uninline_failures_to_damage_table()
+{
+ auto mds = mdcache->mds;
+
+ for (mds_rank_t rank = 0; rank < (mds_rank_t)mds_scrub_stats.size(); rank++) {
+ auto& ufmi = mds_scrub_stats[rank].uninline_failed_meta_info;
+ auto& paths = mds_scrub_stats[rank].paths;
+
+ for (const auto& [scrub_tag, errno_ino_vec_map] : ufmi) {
+ for (const auto& [errno_, ino_vec] : errno_ino_vec_map) {
+ for (auto ino : ino_vec) {
+ mds->damage_table.notify_uninline_failed(ino, rank, errno_, scrub_tag, paths[ino]);
+ }
+ }
}
+ ufmi.clear();
+ paths.clear();
+ // do not clear the counters map; we'll clear them later:
+ // - on user request or
+ // - after a grace period
}
}
@@ -1152,6 +1300,18 @@ void ScrubStack::advance_scrub_status()
any_finished = true;
if (header->get_repaired())
any_repaired = true;
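+ // record rank 0's own uninline failure info and paths, then push them to the damage table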
+ auto& ufmi = mds_scrub_stats[0].uninline_failed_meta_info;
+ ufmi[it->first] = header->get_uninline_failed_info();
+ mds_scrub_stats[0].paths.merge(header->get_paths());
+ move_uninline_failures_to_damage_table();
+
+ auto& c = mds_scrub_stats[0].counters;
+ auto& sc = c[header->get_tag()];
+ sc.uninline_started = header->get_uninline_started();
+ sc.uninline_passed = header->get_uninline_passed();
+ sc.uninline_failed = header->get_uninline_failed();
+ sc.uninline_skipped = header->get_uninline_skipped();
+
scrubbing_map.erase(it++);
} else {
++it;
@@ -1159,7 +1319,6 @@ void ScrubStack::advance_scrub_status()
}
++scrub_epoch;
-
for (auto& r : up_mds) {
if (r == 0)
continue;
@@ -1197,3 +1356,23 @@ void ScrubStack::handle_mds_failure(mds_rank_t mds)
if (kick)
kick_off_scrubs();
}
+
+void ScrubStack::uninline_data(CInode *in, Context *fin)
+{
+ dout(10) << "(uninline_data) starting data uninlining for " << *in << dendl;
+
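+ // issue an internal UNINLINE_DATA request through the MDCache; 'fin' is completed when the request finishes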
+ MDRequestRef mdr = in->mdcache->request_start_internal(CEPH_MDS_OP_UNINLINE_DATA);
+ mdr->set_filepath(filepath(in->ino()));
+ mdr->snapid = CEPH_NOSNAP;
+ mdr->no_early_reply = true;
+ mdr->internal_op_finish = fin;
+
+ in->mdcache->dispatch_request(mdr);
+}
+
+void ScrubStack::handle_conf_change(const std::set<std::string>& changed)
+{
+ if (changed.count("mds_scrub_stats_review_period")) {
+ _mds_scrub_stats_review_period = g_conf().get_val<uint64_t>("mds_scrub_stats_review_period");
+ }
+}
diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h
index 756ebd9cb0e..4d0a0570d35 100644
--- a/src/mds/ScrubStack.h
+++ b/src/mds/ScrubStack.h
@@ -22,6 +22,8 @@
#include "ScrubHeader.h"
#include "common/LogClient.h"
+#include "common/Cond.h"
+#include "common/ceph_time.h"
#include "include/elist.h"
#include "messages/MMDSScrub.h"
#include "messages/MMDSScrubStats.h"
@@ -36,7 +38,9 @@ public:
clog(clog),
finisher(finisher_),
scrub_stack(member_offset(MDSCacheObject, item_scrub)),
- scrub_waiting(member_offset(MDSCacheObject, item_scrub)) {}
+ scrub_waiting(member_offset(MDSCacheObject, item_scrub)) {
+ _mds_scrub_stats_review_period = g_conf().get_val<uint64_t>("mds_scrub_stats_review_period");
+ }
~ScrubStack() {
ceph_assert(scrub_stack.empty());
ceph_assert(!scrubs_in_progress);
@@ -103,6 +107,14 @@ public:
bool remove_inode_if_stacked(CInode *in);
+ void move_uninline_failures_to_damage_table();
+
+ void init_scrub_counters(std::string_view path, std::string_view tag);
+ void purge_scrub_counters(std::string_view tag);
+ void purge_old_scrub_counters(); // on tick
+ void handle_conf_change(const std::set<std::string>& changed);
+
MDCache *mdcache;
protected:
@@ -132,10 +144,21 @@ protected:
// check if any mds is aborting scrub after mds.0 starts
bool scrub_any_peer_aborting = true;
+ struct scrub_counters_t {
+ ceph::coarse_real_clock::time_point start_time = coarse_real_clock::now();
+ std::string origin_path;
+ uint64_t uninline_started = 0;
+ uint64_t uninline_passed = 0;
+ uint64_t uninline_failed = 0;
+ uint64_t uninline_skipped = 0;
+ };
struct scrub_stat_t {
unsigned epoch_acked = 0;
std::set<std::string> scrubbing_tags;
bool aborting = false;
+ std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>> uninline_failed_meta_info;
+ std::unordered_map<_inodeno_t, std::string> paths;
+ std::unordered_map<std::string, scrub_counters_t> counters; // map(scrub_tag -> counters)
};
std::vector<scrub_stat_t> mds_scrub_stats;
@@ -154,6 +177,9 @@ private:
friend std::ostream &operator<<(std::ostream &os, const State &state);
friend class C_InodeValidated;
+ friend class C_IO_DataUninlined;
+ friend class C_MDC_DataUninlinedSubmitted;
+ friend class MDCache;
int _enqueue(MDSCacheObject *obj, ScrubHeaderRef& header, bool top);
/**
@@ -267,6 +293,7 @@ private:
void handle_scrub(const cref_t<MMDSScrub> &m);
void handle_scrub_stats(const cref_t<MMDSScrubStats> &m);
+ void uninline_data(CInode *in, Context *fin);
State state = STATE_IDLE;
bool clear_stack = false;
@@ -274,6 +301,8 @@ private:
// list of pending context completions for asynchronous scrub
// control operations.
std::vector<Context *> control_ctxs;
+
+ uint64_t _mds_scrub_stats_review_period = 1; // 1 day
};
#endif /* SCRUBSTACK_H_ */
diff --git a/src/messages/MMDSScrubStats.h b/src/messages/MMDSScrubStats.h
index 2cbb7f2f211..9252f99ac8a 100644
--- a/src/messages/MMDSScrubStats.h
+++ b/src/messages/MMDSScrubStats.h
@@ -18,7 +18,7 @@
#include "messages/MMDSOp.h"
class MMDSScrubStats : public MMDSOp {
- static constexpr int HEAD_VERSION = 1;
+ static constexpr int HEAD_VERSION = 2;
static constexpr int COMPAT_VERSION = 1;
public:
@@ -38,6 +38,15 @@ public:
bool is_finished(const std::string& tag) const {
return update_scrubbing && !scrubbing_tags.count(tag);
}
+ const std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>>& get_uninline_failed_meta_info() const {
+ return uninline_failed_meta_info;
+ }
+ const std::unordered_map<_inodeno_t, std::string>& get_paths() const {
+ return paths;
+ }
+ const std::unordered_map<std::string, std::vector<uint64_t>>& get_counters() const {
+ return counters;
+ }
void encode_payload(uint64_t features) override {
using ceph::encode;
@@ -45,6 +54,7 @@ public:
encode(scrubbing_tags, payload);
encode(update_scrubbing, payload);
encode(aborting, payload);
+ encode_uninline_failed_info();
}
void decode_payload() override {
using ceph::decode;
@@ -53,6 +63,90 @@ public:
decode(scrubbing_tags, p);
decode(update_scrubbing, p);
decode(aborting, p);
+ if (header.version >= 2) {
+ decode_uninline_failed_info(p);
+ }
+ }
+
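+ // v2 payload layout: each map is encoded as an explicit int count followed by
+ // its entries; decode_uninline_failed_info() must mirror this layout exactly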
+ void encode_uninline_failed_info() {
+ using ceph::encode;
+ int count = (int)uninline_failed_meta_info.size();
+ encode(count, payload);
+ for (const auto& [tag, meta_info_map] : uninline_failed_meta_info) {
+ encode(tag, payload);
+ count = (int)meta_info_map.size();
+ encode(count, payload);
+ for (const auto& [error_code, ino_vec] : meta_info_map) {
+ encode(error_code, payload);
+ encode(ino_vec, payload);
+ }
+ }
+ count = (int)paths.size();
+ encode(count, payload);
+ for (auto& [ino, path] : paths) {
+ encode(ino, payload);
+ encode(path, payload);
+ }
+ count = (int)counters.size();
+ encode(count, payload);
+ for (auto& [tag, v] : counters) {
+ encode(tag, payload);
+ uint64_t started = v[0];
+ uint64_t passed = v[1];
+ uint64_t failed = v[2];
+ uint64_t skipped = v[3];
+
+ encode(started, payload);
+ encode(passed, payload);
+ encode(failed, payload);
+ encode(skipped, payload);
+ }
+ }
+ void decode_uninline_failed_info(ceph::bufferlist::const_iterator& p) {
+ using ceph::decode;
+ int tag_count = 0;
+ decode(tag_count, p);
+ while (tag_count--) {
+ std::string tag;
+ decode(tag, p);
+ int count = 0;
+ decode(count, p);
+ std::unordered_map<int, std::vector<_inodeno_t>> uninline_failed_info;
+ while (count--) {
+ int error_code;
+ std::vector<_inodeno_t> ino_vec;
+ decode(error_code, p);
+ decode(ino_vec, p);
+ uninline_failed_info[error_code] = std::move(ino_vec);
+ }
+ uninline_failed_meta_info[tag] = std::move(uninline_failed_info);
+ }
+ int count = 0;
+ decode(count, p);
+ while (count--) {
+ _inodeno_t ino;
+ std::string path;
+ decode(ino, p);
+ decode(path, p);
+ paths[ino] = path;
+ }
+ count = 0;
+ decode(count, p);
+ while (count--) {
+ std::string tag;
+ decode(tag, p);
+ uint64_t started = 0;
+ uint64_t passed = 0;
+ uint64_t failed = 0;
+ uint64_t skipped = 0;
+
+ decode(started, p);
+ decode(passed, p);
+ decode(failed, p);
+ decode(skipped, p);
+ std::vector<uint64_t> c{started, passed, failed, skipped};
+ counters[tag] = c;
+ }
}
protected:
@@ -65,6 +159,15 @@ protected:
MMDSScrubStats(unsigned e, const std::set<std::string>& tags, bool abrt=false) :
MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION),
epoch(e), scrubbing_tags(tags), update_scrubbing(true), aborting(abrt) {}
+ MMDSScrubStats(unsigned e, const std::set<std::string>& tags,
+ std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>>&& ufmi,
+ std::unordered_map<_inodeno_t, std::string>&& paths_,
+ std::unordered_map<std::string, std::vector<uint64_t>>&& counters_,
+ bool abrt = false) :
+ MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION),
+ epoch(e), scrubbing_tags(tags), update_scrubbing(true), aborting(abrt),
+ uninline_failed_meta_info(std::move(ufmi)), paths(std::move(paths_)),
+ counters(std::move(counters_)) {}
~MMDSScrubStats() override {}
private:
@@ -72,6 +175,10 @@ private:
std::set<std::string> scrubbing_tags;
bool update_scrubbing = false;
bool aborting = false;
+ // <tag, <error_code, [ino1, ino2, ...]>>
+ std::unordered_map<std::string, std::unordered_map<int, std::vector<_inodeno_t>>> uninline_failed_meta_info;
+ std::unordered_map<_inodeno_t, std::string> paths;
+ std::unordered_map<std::string, std::vector<uint64_t>> counters;
template<class T, typename... Args>
friend boost::intrusive_ptr<T> ceph::make_message(Args&&... args);
diff --git a/src/mon/NVMeofGwMap.cc b/src/mon/NVMeofGwMap.cc
index b20060c68fe..7d886344244 100755
--- a/src/mon/NVMeofGwMap.cc
+++ b/src/mon/NVMeofGwMap.cc
@@ -37,9 +37,15 @@ void NVMeofGwMap::to_gmap(
for (const auto& gw_created_pair: gw_created_map) {
const auto& gw_id = gw_created_pair.first;
const auto& gw_created = gw_created_pair.second;
+ gw_availability_t availability = gw_created.availability;
+ // Gateways expect to see UNAVAILABLE, not DELETING
+ // for entries in DELETING state
+ if (gw_created.availability == gw_availability_t::GW_DELETING) {
+ availability = gw_availability_t::GW_UNAVAILABLE;
+ }
auto gw_state = NvmeGwClientState(
- gw_created.ana_grp_id, epoch, gw_created.availability);
+ gw_created.ana_grp_id, epoch, availability);
for (const auto& sub: gw_created.subsystems) {
gw_state.subsystems.insert({
sub.nqn,
@@ -78,10 +84,36 @@ int NVMeofGwMap::cfg_add_gw(
for (auto& itr: created_gws[group_key]) {
allocated.insert(itr.second.ana_grp_id);
if (itr.first == gw_id) {
- dout(1) << __func__ << " ERROR create GW: already exists in map "
- << gw_id << dendl;
- return -EEXIST ;
+ if (itr.second.availability != gw_availability_t::GW_DELETING) {
+ dout(1) << __func__ << " ERROR create GW: already exists in map "
+ << gw_id << dendl;
+ return -EEXIST;
+ } else {
+ // this GW already exists in the map in "Deleting" state,
+ // but the user is creating it again - just reset its attributes
+ created_gws[group_key][gw_id].performed_full_startup = true;
+ created_gws[group_key][gw_id].availability
+ = gw_availability_t::GW_CREATED;
+ dout(4) << "GW in Deleting state " << gw_id
+ << " was created again" << dendl;
+ return 0;
+ }
}
+ if (itr.second.availability == gw_availability_t::GW_DELETING) {
+ // found an existing GW in "Deleting" state - the new GW inherits its ANA group
+ NvmeGwMonState & gw_created = created_gws[group_key][itr.first];
+ created_gws[group_key][gw_id] = gw_created;
+ // Deep copy of all data of "Deleting" GW
+ created_gws[group_key][gw_id].performed_full_startup = true;
+ created_gws[group_key][gw_id].availability
+ = gw_availability_t::GW_CREATED;
+ dout(4) << "Created GW inherits ANA group of deleting GW-id :"
+ << itr.first << " group " << itr.second.ana_grp_id << dendl;
+ do_erase_gw_id(itr.first, group_key);
+ dout(4) << "Created GWS after create/delete: "
+ << created_gws << dendl;
+ return 0;
+ }
}
if (allocated.size() == MAX_SUPPORTED_ANA_GROUPS) {
dout(4) << "Warning: cannot add GW " << gw_id
@@ -125,7 +157,40 @@ int NVMeofGwMap::cfg_add_gw(
int NVMeofGwMap::cfg_delete_gw(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key)
{
- int rc = 0;
+ if (HAVE_FEATURE(mon->get_quorum_con_features(), NVMEOFHA)) {
+ dout(10) << " has NVMEOFHA: 1" << dendl;
+ for (auto& gws_states: created_gws[group_key]) {
+ if (gws_states.first == gw_id) {
+ auto& state = gws_states.second;
+ auto prev_availability = state.availability;
+ state.availability = gw_availability_t::GW_DELETING;
+ dout(4) << " Deleting GW :" << gw_id << " in state " << prev_availability
+ << " Resulting GW availability: "
+ << state.availability << dendl;
+ return 0;
+ }
+ }
+ } else {
+ return do_delete_gw(gw_id, group_key);
+ }
+ return -EINVAL;
+}
+
+int NVMeofGwMap::do_erase_gw_id(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key) {
+
+ fsm_timers[group_key].erase(gw_id);
+ if (fsm_timers[group_key].size() == 0)
+ fsm_timers.erase(group_key);
+
+ created_gws[group_key].erase(gw_id);
+ if (created_gws[group_key].size() == 0)
+ created_gws.erase(group_key);
+ return 0;
+}
+
+int NVMeofGwMap::do_delete_gw(
+ const NvmeGwId &gw_id, const NvmeGroupKey& group_key)
+{
for (auto& gws_states: created_gws[group_key]) {
if (gws_states.first == gw_id) {
@@ -136,26 +201,52 @@ int NVMeofGwMap::cfg_delete_gw(
gw_id, group_key,state_itr.second , state_itr.first, modified);
}
dout(10) << " Delete GW :"<< gw_id << " ANA grpid: "
- << state.ana_grp_id << dendl;
+ << state.ana_grp_id << dendl;
for (auto& itr: created_gws[group_key]) {
// Update state map and other maps
remove_grp_id(itr.first, group_key, state.ana_grp_id);
// of all created gateways. Removed key = anagrp
}
- fsm_timers[group_key].erase(gw_id);
- if (fsm_timers[group_key].size() == 0)
- fsm_timers.erase(group_key);
-
- created_gws[group_key].erase(gw_id);
- if (created_gws[group_key].size() == 0)
- created_gws.erase(group_key);
- return rc;
+ return do_erase_gw_id(gw_id, group_key);
}
}
return -EINVAL;
}
+int NVMeofGwMap::get_num_namespaces(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key, const BeaconSubsystems& subs)
+{
+ auto grpid = created_gws[group_key][gw_id].ana_grp_id ;
+ int num_ns = 0;
+ for (auto & subs_it:subs) {
+ for (auto & ns :subs_it.namespaces) {
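+ // ns.anagrpid is 1-based, while the internal ana_grp_id is 0-based, hence the +1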
+ if (ns.anagrpid == (grpid+1)) {
+ num_ns++;
+ }
+ }
+ }
+ return num_ns;
+}
+
+void NVMeofGwMap::track_deleting_gws(const NvmeGroupKey& group_key,
+ const BeaconSubsystems& subs, bool &propose_pending)
+{
+ propose_pending = false;
+ for (auto& itr: created_gws[group_key]) {
+ auto &gw_id = itr.first;
+ if (itr.second.availability == gw_availability_t::GW_DELETING) {
+ int num_ns = 0;
+ if ( (num_ns = get_num_namespaces(gw_id, group_key, subs)) == 0) {
+ do_delete_gw(gw_id, group_key);
+ propose_pending = true;
+ }
+ dout(4) << " to delete ? " << gw_id << " num_ns " << num_ns << dendl;
+ break; // handle just one GW in "Deleting" state at a time
+ }
+ }
+}
+
int NVMeofGwMap::process_gw_map_gw_down(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key, bool &propose_pending)
{
@@ -192,17 +283,8 @@ void NVMeofGwMap::process_gw_map_ka(
dout(20) << "KA beacon from the GW " << gw_id
<< " in state " << (int)st.availability << dendl;
- if (st.availability == gw_availability_t::GW_CREATED) {
- // first time appears - allow IO traffic for this GW
- st.availability = gw_availability_t::GW_AVAILABLE;
- for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
- state_itr.second = gw_states_per_group_t::GW_STANDBY_STATE;
- }
- if (st.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) { // not a redundand GW
- st.active_state(st.ana_grp_id);
- }
- propose_pending = true;
- } else if (st.availability == gw_availability_t::GW_UNAVAILABLE) {
+ if (st.availability == gw_availability_t::GW_CREATED ||
+ st.availability == gw_availability_t::GW_UNAVAILABLE) {
st.availability = gw_availability_t::GW_AVAILABLE;
if (st.ana_grp_id == REDUNDANT_GW_ANA_GROUP_ID) {
for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
@@ -237,7 +319,9 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose)
// 1. Failover missed : is there is a GW in unavailable state?
// if yes, is its ANA group handled by some other GW?
- if (state.availability == gw_availability_t::GW_UNAVAILABLE &&
+ if ((state.availability == gw_availability_t::GW_UNAVAILABLE ||
+ state.availability == gw_availability_t::GW_DELETING ||
+ state.availability == gw_availability_t::GW_CREATED) &&
state.ana_grp_id != REDUNDANT_GW_ANA_GROUP_ID) {
auto found_gw_for_ana_group = false;
for (auto& gw_state2 : gws_states) {
@@ -251,7 +335,7 @@ void NVMeofGwMap::handle_abandoned_ana_groups(bool& propose)
}
// choose the GW for handle ana group
if (found_gw_for_ana_group == false) {
- dout(10) << "Was not found the GW " << " that handles ANA grp "
+ dout(20) << "Was not found the GW " << " that handles ANA grp "
<< (int)state.ana_grp_id << " find candidate "<< dendl;
for (auto& state_itr: created_gws[group_key][gw_id].sm_state) {
find_failover_candidate(gw_id, group_key, state_itr.first, propose);
@@ -277,14 +361,23 @@ void NVMeofGwMap::set_failover_gw_for_ANA_group(
const NvmeGwId &gw_id, NvmeAnaGrpId ANA_groupid)
{
NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
+ NvmeGwMonState& failed_gw_state = created_gws[group_key][failed_gw_id];
epoch_t epoch;
dout(10) << "Found failover GW " << gw_id
<< " for ANA group " << (int)ANA_groupid << dendl;
+ if (failed_gw_state.availability == gw_availability_t::GW_CREATED) {
+ dout(10) << "Failover GW " << gw_id <<
+ " takes over the group of GW in Created state " <<
+ failed_gw_id << dendl;
+ // just take over on the group of created GW
+ gw_state.active_state(ANA_groupid);
+ return;
+ }
int rc = blocklist_gw(failed_gw_id, group_key, ANA_groupid, epoch, true);
if (rc) {
//start failover even when nonces are empty !
gw_state.active_state(ANA_groupid);
- } else{
+ } else {
gw_state.sm_state[ANA_groupid] =
gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL;
gw_state.blocklist_data[ANA_groupid].osd_epoch = epoch;
@@ -507,7 +600,7 @@ void NVMeofGwMap::fsm_handle_gw_alive(
// ana group wouldnt be taken back during blocklist wait period
cancel_timer(gw_id, group_key, grpid);
map_modified = true;
- } else{
+ } else {
dout(20) << "osd epoch not changed from "
<< gw_map.blocklist_data[grpid].osd_epoch
<< " to "<< last_osd_epoch
@@ -576,6 +669,8 @@ void NVMeofGwMap::fsm_handle_gw_down(
void NVMeofGwMap::fsm_handle_gw_delete(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
gw_states_per_group_t state , NvmeAnaGrpId grpid, bool &map_modified) {
+ // This function is called when the GW has already gone through failover and its
+ // native ANA group has no volumes, so some states are not relevant here
switch (state) {
case gw_states_per_group_t::GW_STANDBY_STATE:
case gw_states_per_group_t::GW_IDLE_STATE:
@@ -583,8 +678,8 @@ void NVMeofGwMap::fsm_handle_gw_delete(
{
NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
- // Try to find GW that temporary owns my group - if found,
- // this GW should pass to standby for this group
+ // Try to find the GW that temporarily owns the group of the gw-id that is
+ // about to disappear - if found, that GW should move to standby for this group
if (grpid == gw_state.ana_grp_id) {
auto& gateway_states = created_gws[group_key];
for (auto& gs: gateway_states) {
@@ -605,43 +700,6 @@ void NVMeofGwMap::fsm_handle_gw_delete(
}
break;
- case gw_states_per_group_t::GW_WAIT_BLOCKLIST_CMPL:
- {
- NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
- cancel_timer(gw_id, group_key, grpid);
- map_modified = true;
- gw_state.standby_state(grpid);
- }
- break;
-
- case gw_states_per_group_t::GW_WAIT_FAILBACK_PREPARED:
- {
- cancel_timer(gw_id, group_key, grpid);
- map_modified = true;
- for (auto& nqn_gws_state: created_gws[group_key]) {
- auto& st = nqn_gws_state.second;
-
- // found GW that was intended for Failback for this ana grp
- if (st.sm_state[grpid] ==
- gw_states_per_group_t::GW_OWNER_WAIT_FAILBACK_PREPARED) {
- dout(4) << "Warning: Outgoing Failback when GW is deleted "
- << "- to rollback it GW " << gw_id << "for ANA Group "
- << grpid << dendl;
- st.standby_state(grpid);
- break;
- }
- }
- }
- break;
-
- case gw_states_per_group_t::GW_ACTIVE_STATE:
- {
- NvmeGwMonState& gw_state = created_gws[group_key][gw_id];
- map_modified = true;
- gw_state.standby_state(grpid);
- }
- break;
-
default: {
dout(4) << "Error : Invalid state " << state
<< "for GW " << gw_id << dendl;
@@ -744,8 +802,14 @@ int NVMeofGwMap::blocklist_gw(
{
// find_already_created_gw(gw_id, group_key);
NvmeGwMonState& gw_map = created_gws[group_key][gw_id];
+ NvmeNonceVector nonces;
+ for (auto& state_itr: gw_map.sm_state) {
+ // collect nonces from all ANA groups so the blocklist covers every address of the failing GW
+ nonces.insert(nonces.end(), gw_map.nonce_map[state_itr.first].begin(),
+ gw_map.nonce_map[state_itr.first].end());
+ }
- if (gw_map.nonce_map[grpid].size() > 0) {
+ if (nonces.size() > 0) {
NvmeNonceVector &nonce_vector = gw_map.nonce_map[grpid];;
std::string str = "[";
entity_addrvec_t addr_vect;
@@ -759,10 +823,10 @@ int NVMeofGwMap::blocklist_gw(
str += it;
}
str += "]";
- bool rc = addr_vect.parse(&str[0]);
- dout(10) << str << " rc " << rc << " network vector: " << addr_vect
+ bool success = addr_vect.parse(&str[0]);
+ dout(10) << str << " parse success " << success << " network vector: " << addr_vect
<< " " << addr_vect.size() << dendl;
- if (rc) {
+ if (!success) {
return 1;
}
@@ -781,14 +845,14 @@ int NVMeofGwMap::blocklist_gw(
new CMonRequestProposal(this, addr_vect, expires)
);
// return false;
- } else{
+ } else {
mon->nvmegwmon()->request_proposal(mon->osdmon());
}
}
dout(10) << str << " mon->osdmon()->blocklist: epoch : " << epoch
<< " address vector: " << addr_vect << " "
<< addr_vect.size() << dendl;
- } else{
+ } else {
dout(4) << "Error: No nonces context present for gw: "
<< gw_id << " ANA group: " << grpid << dendl;
return 1;
diff --git a/src/mon/NVMeofGwMap.h b/src/mon/NVMeofGwMap.h
index 688a68662be..4c9d7966410 100755
--- a/src/mon/NVMeofGwMap.h
+++ b/src/mon/NVMeofGwMap.h
@@ -44,7 +44,8 @@ public:
std::map<NvmeGroupKey, NvmeGwTimers> fsm_timers;
void to_gmap(std::map<NvmeGroupKey, NvmeGwMonClientStates>& Gmap) const;
-
+ void track_deleting_gws(const NvmeGroupKey& group_key,
+ const BeaconSubsystems& subs, bool &propose_pending);
int cfg_add_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
int cfg_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
void process_gw_map_ka(
@@ -64,6 +65,9 @@ public:
void handle_gw_performing_fast_reboot(const NvmeGwId &gw_id,
const NvmeGroupKey& group_key, bool &map_modified);
private:
+ int do_delete_gw(const NvmeGwId &gw_id, const NvmeGroupKey& group_key);
+ int do_erase_gw_id(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key);
void add_grp_id(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
const NvmeAnaGrpId grpid);
@@ -95,7 +99,8 @@ private:
void set_failover_gw_for_ANA_group(
const NvmeGwId &failed_gw_id, const NvmeGroupKey& group_key,
const NvmeGwId &gw_id, NvmeAnaGrpId groupid);
-
+ int get_num_namespaces(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key, const BeaconSubsystems& subs );
int get_timer(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
NvmeAnaGrpId anagrpid);
diff --git a/src/mon/NVMeofGwMon.cc b/src/mon/NVMeofGwMon.cc
index 651046515fe..b6faeb2e97c 100644
--- a/src/mon/NVMeofGwMon.cc
+++ b/src/mon/NVMeofGwMon.cc
@@ -105,6 +105,7 @@ void NVMeofGwMon::tick()
const auto cutoff = now - nvmegw_beacon_grace;
// Pass over all the stored beacons
+ NvmeGroupKey old_group_key;
for (auto &itr : last_beacon) {
auto& lb = itr.first;
auto last_beacon_time = itr.second;
@@ -114,6 +115,14 @@ void NVMeofGwMon::tick()
_propose_pending |= propose;
last_beacon.erase(lb);
} else {
+ BeaconSubsystems *subsystems =
+ &pending_map.created_gws[lb.group_key][lb.gw_id].subsystems;
+ if (subsystems && subsystems->size() && old_group_key != lb.group_key) {
+ // call track_deleting_gws only once per group-key
+ pending_map.track_deleting_gws(lb.group_key, *subsystems, propose);
+ old_group_key = lb.group_key;
+ _propose_pending |= propose;
+ }
dout(20) << "beacon live for GW key: " << lb.gw_id << dendl;
}
}
@@ -299,29 +308,58 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
auto group_key = std::make_pair(pool, group);
dout(10) << "nvme-gw show pool " << pool << " group " << group << dendl;
- if (map.created_gws[group_key].size()) {
- f->open_object_section("common");
- f->dump_unsigned("epoch", map.epoch);
- f->dump_string("pool", pool);
- f->dump_string("group", group);
- f->dump_unsigned("num gws", map.created_gws[group_key].size());
+ f->open_object_section("common");
+ f->dump_unsigned("epoch", map.epoch);
+ f->dump_string("pool", pool);
+ f->dump_string("group", group);
+ if (HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA)) {
+ f->dump_string("features", "LB");
+ }
+ f->dump_unsigned("num gws", map.created_gws[group_key].size());
+ if (map.created_gws[group_key].size() == 0) {
+ f->close_section();
+ f->flush(rdata);
+ sstrm.str("");
+ } else {
sstrm << "[ ";
NvmeGwId gw_id;
+ BeaconSubsystems *subsystems = NULL;
for (auto& gw_created_pair: map.created_gws[group_key]) {
- gw_id = gw_created_pair.first;
- auto& st = gw_created_pair.second;
- sstrm << st.ana_grp_id+1 << " ";
+ gw_id = gw_created_pair.first;
+ auto& st = gw_created_pair.second;
+ if (st.availability != gw_availability_t::GW_DELETING) {
+ // do not show the ANA group of a deleting GW in the list -
+ // it is only kept as information for the GW during the rebalancing process
+ sstrm << st.ana_grp_id+1 << " ";
+ }
+ if (st.availability == gw_availability_t::GW_AVAILABLE) {
+ subsystems = &st.subsystems;
+ }
}
sstrm << "]";
f->dump_string("Anagrp list", sstrm.str());
- f->close_section();
-
+ std::map<NvmeAnaGrpId, uint16_t> num_ns;
+ uint16_t total_ns = 0;
+ if (subsystems && subsystems->size()) {
+ for (auto & subs_it:*subsystems) {
+ for (auto & ns :subs_it.namespaces) {
+ if (num_ns.find(ns.anagrpid) == num_ns.end()) num_ns[ns.anagrpid] = 0;
+ num_ns[ns.anagrpid] +=1;
+ total_ns += 1;
+ }
+ }
+ }
+ f->dump_unsigned("num-namespaces", total_ns);
+ f->open_array_section("Created Gateways:");
+ uint32_t i = 0;
for (auto& gw_created_pair: map.created_gws[group_key]) {
auto& gw_id = gw_created_pair.first;
auto& state = gw_created_pair.second;
+ i = 0;
f->open_object_section("stat");
f->dump_string("gw-id", gw_id);
f->dump_unsigned("anagrp-id",state.ana_grp_id+1);
+ f->dump_unsigned("num-namespaces", num_ns[state.ana_grp_id+1]);
f->dump_unsigned("performed-full-startup", state.performed_full_startup);
std::stringstream sstrm1;
sstrm1 << state.availability;
@@ -329,17 +367,18 @@ bool NVMeofGwMon::preprocess_command(MonOpRequestRef op)
sstrm1.str("");
for (auto &state_itr: map.created_gws[group_key][gw_id].sm_state) {
sstrm1 << " " << state_itr.first + 1 << ": "
- << state.sm_state[state_itr.first] << ",";
+ << state.sm_state[state_itr.first];
+ if (++i < map.created_gws[group_key][gw_id].sm_state.size())
+ sstrm1<< ", ";
}
f->dump_string("ana states", sstrm1.str());
f->close_section();
}
+ f->close_section();
+ f->close_section();
f->flush(rdata);
sstrm.str("");
}
- else {
- sstrm << "num_gws 0";
- }
getline(sstrm, rs);
mon.reply_command(op, err, rs, rdata, get_last_committed());
return true;
@@ -388,19 +427,18 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
<< " " << pool << " " << group << " rc " << rc << dendl;
sstrm.str("");
}
- }
- else{
+ } else {
rc = pending_map.cfg_delete_gw(id, group_key);
- if (rc == -EINVAL) {
+ if (rc == 0) {
+ bool propose = false;
+ // Simulate immediate Failover of this GW
+ process_gw_down(id, group_key, propose);
+ } else if (rc == -EINVAL) {
dout (4) << "Error: GW not found in the database " << id << " "
<< pool << " " << group << " rc " << rc << dendl;
err = 0;
sstrm.str("");
}
- if (rc == 0) {
- LastBeacon lb = {id, group_key};
- last_beacon.erase(lb);
- }
}
// propose pending would be generated by the PaxosService
if ((rc != -EEXIST) && (rc != -EINVAL)) {
@@ -423,6 +461,16 @@ bool NVMeofGwMon::prepare_command(MonOpRequestRef op)
return response;
}
+void NVMeofGwMon::process_gw_down(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key, bool &propose_pending)
+{
+ LastBeacon lb = {gw_id, group_key};
+ auto it = last_beacon.find(lb);
+ if (it != last_beacon.end()) {
+ last_beacon.erase(it);
+ pending_map.process_gw_map_gw_down(gw_id, group_key, propose_pending);
+ }
+}
bool NVMeofGwMon::preprocess_beacon(MonOpRequestRef op)
{
@@ -527,7 +575,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
<< pending_map.created_gws[group_key][gw_id].nonce_map << dendl;
nonce_propose = true;
}
- } else {
+ } else {
dout(10) << "Warning: received empty nonce map in the beacon of GW "
<< gw_id << " " << dendl;
}
@@ -560,13 +608,7 @@ bool NVMeofGwMon::prepare_beacon(MonOpRequestRef op)
pending_map.process_gw_map_ka(gw_id, group_key, last_osd_epoch, propose);
// state set by GW client application
} else if (avail == gw_availability_t::GW_UNAVAILABLE) {
- LastBeacon lb = {gw_id, group_key};
-
- auto it = last_beacon.find(lb);
- if (it != last_beacon.end()) {
- last_beacon.erase(lb);
- pending_map.process_gw_map_gw_down(gw_id, group_key, propose);
- }
+ process_gw_down(gw_id, group_key, propose);
}
// Periodic: check active FSM timers
pending_map.update_active_timers(timer_propose);
diff --git a/src/mon/NVMeofGwMon.h b/src/mon/NVMeofGwMon.h
index acd72dfe0c4..f132c87d92a 100644
--- a/src/mon/NVMeofGwMon.h
+++ b/src/mon/NVMeofGwMon.h
@@ -84,7 +84,8 @@ public:
private:
void synchronize_last_beacon();
-
+ void process_gw_down(const NvmeGwId &gw_id,
+ const NvmeGroupKey& group_key, bool &propose_pending);
};
#endif /* MON_NVMEGWMONITOR_H_ */
diff --git a/src/mon/NVMeofGwSerialize.h b/src/mon/NVMeofGwSerialize.h
index cbda90ea379..b10eac88c2f 100755
--- a/src/mon/NVMeofGwSerialize.h
+++ b/src/mon/NVMeofGwSerialize.h
@@ -74,6 +74,9 @@ inline std::ostream& operator<<(
case gw_availability_t::GW_UNAVAILABLE:
os << "UNAVAILABLE";
break;
+ case gw_availability_t::GW_DELETING:
+ os << "DELETING"; break;
+
default:
os << "Invalid " << (int)value << " ";
}
diff --git a/src/mon/NVMeofGwTypes.h b/src/mon/NVMeofGwTypes.h
index 057c8cd3795..2dd3e11ba3a 100755
--- a/src/mon/NVMeofGwTypes.h
+++ b/src/mon/NVMeofGwTypes.h
@@ -42,6 +42,7 @@ enum class gw_availability_t {
GW_CREATED = 0,
GW_AVAILABLE,
GW_UNAVAILABLE,
+ GW_DELETING,
GW_DELETED
};
@@ -134,9 +135,12 @@ struct NvmeGwMonState {
: ana_grp_id(id), availability(gw_availability_t::GW_CREATED),
last_gw_map_epoch_valid(false), performed_full_startup(false) {}
void set_unavailable_state() {
- availability = gw_availability_t::GW_UNAVAILABLE;
- // after setting this state the next time monitor sees GW,
- // it expects it performed the full startup
+ if (availability != gw_availability_t::GW_DELETING) {
+ // do not override the Deleting state
+ availability = gw_availability_t::GW_UNAVAILABLE;
+ }
+ // after setting this state, the next time the monitor sees this GW
+ // it expects it to have performed the full startup
performed_full_startup = false;
}
void standby_state(NvmeAnaGrpId grpid) {
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 6bcb77ee5a9..ef4dd3fc4f0 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -612,6 +612,7 @@ public:
void on_backfill_reserved() override;
void on_backfill_canceled() override;
+ void on_recovery_cancelled() override {}
void on_recovery_reserved() override;
bool is_forced_recovery_or_backfill() const {
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index 944b77c2812..22222b7f7af 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -5827,6 +5827,7 @@ PeeringState::Recovering::react(const DeferRecovery &evt)
ps->state_set(PG_STATE_RECOVERY_WAIT);
pl->cancel_local_background_io_reservation();
release_reservations(true);
+ pl->on_recovery_cancelled();
pl->schedule_event_after(
std::make_shared<PGPeeringEvent>(
ps->get_osdmap_epoch(),
@@ -5844,6 +5845,7 @@ PeeringState::Recovering::react(const UnfoundRecovery &evt)
ps->state_set(PG_STATE_RECOVERY_UNFOUND);
pl->cancel_local_background_io_reservation();
release_reservations(true);
+ pl->on_recovery_cancelled();
return transit<NotRecovering>();
}
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
index f036bb44b11..11ac084a054 100644
--- a/src/osd/PeeringState.h
+++ b/src/osd/PeeringState.h
@@ -419,6 +419,7 @@ public:
virtual void on_backfill_reserved() = 0;
virtual void on_backfill_canceled() = 0;
virtual void on_recovery_reserved() = 0;
+ virtual void on_recovery_cancelled() = 0;
// ================recovery space accounting ================
virtual bool try_reserve_recovery_space(
diff --git a/src/osd/SnapMapper.cc b/src/osd/SnapMapper.cc
index 414de479dc7..4ece8880411 100644
--- a/src/osd/SnapMapper.cc
+++ b/src/osd/SnapMapper.cc
@@ -728,6 +728,50 @@ int SnapMapper::get_snaps(
return 0;
}
+void SnapMapper::update_snap_map(
+ const pg_log_entry_t& i,
+ MapCacher::Transaction<std::string, ceph::buffer::list> *_t)
+{
+ ceph_assert(i.soid.snap < CEPH_MAXSNAP);
+ dout(20) << __func__ << " " << i << dendl;
+ if (i.is_delete()) {
+ int r = remove_oid(
+ i.soid,
+ _t);
+ if (r)
+ dout(20) << __func__ << " remove_oid " << i.soid << " failed with " << r << dendl;
+ // On removal tolerate missing key corruption
+ ceph_assert(r == 0 || r == -ENOENT);
+ } else if (i.is_update()) {
+ ceph_assert(i.snaps.length() > 0);
+ std::vector<snapid_t> snaps;
+ bufferlist snapbl = i.snaps;
+ auto p = snapbl.cbegin();
+ try {
+ decode(snaps, p);
+ } catch (...) {
+ dout(20) << __func__ << " decode snaps failure on " << i << dendl;
+ snaps.clear();
+ }
+ std::set<snapid_t> _snaps(snaps.begin(), snaps.end());
+
+ if (i.is_clone() || i.is_promote()) {
+ add_oid(
+ i.soid,
+ _snaps,
+ _t);
+ } else if (i.is_modify()) {
+ int r = update_snaps(
+ i.soid,
+ _snaps,
+ 0,
+ _t);
+ ceph_assert(r == 0);
+ } else {
+ ceph_assert(i.is_clean());
+ }
+ }
+}
// -- purged snaps --
diff --git a/src/osd/SnapMapper.h b/src/osd/SnapMapper.h
index f9a371932f6..a43bb5d94f0 100644
--- a/src/osd/SnapMapper.h
+++ b/src/osd/SnapMapper.h
@@ -394,6 +394,10 @@ private:
std::set<snapid_t> *snaps ///< [out] snaps
) const; ///< @return error, -ENOENT if oid is not recorded
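+ /// Update the snap mapping for the object referenced by a pg log entry:
+ /// remove it on delete, add it on clone/promote, update its snaps on modify.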
+ void update_snap_map(
+ const pg_log_entry_t& i,
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t);
+
/// Get snaps for oid - alternative interface
tl::expected<std::set<snapid_t>, SnapMapReaderI::result_t> get_snaps(
const hobject_t &hoid) const final;
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 0e4253b339a..b0bdc9fcb95 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -2220,7 +2220,7 @@ void PgScrubber::on_mid_scrub_abort(Scrub::delay_cause_t issue)
// that made any of the targets into a high-priority one. All that's left:
// delay the specific target that was aborted.
- auto& trgt = m_scrub_job->delay_on_failure(aborted_target.level(), 5s, issue,
+ auto& trgt = m_scrub_job->delay_on_failure(aborted_target.level(), issue,
scrub_clock_now);
/// \todo complete the merging of the deadline & target for non-hp targets
@@ -2251,8 +2251,7 @@ void PgScrubber::requeue_penalized(
return;
}
/// \todo fix the 5s' to use a cause-specific delay parameter
- auto& trgt =
- m_scrub_job->delay_on_failure(s_or_d, 5s, cause, scrub_clock_now);
+ auto& trgt = m_scrub_job->delay_on_failure(s_or_d, cause, scrub_clock_now);
ceph_assert(!trgt.queued);
m_osds->get_scrub_services().enqueue_target(trgt);
trgt.queued = true;
@@ -2274,7 +2273,7 @@ void PgScrubber::requeue_penalized(
m_osds->get_scrub_services().dequeue_target(m_pg_id, sister_level);
trgt2.queued = false;
}
- m_scrub_job->delay_on_failure(sister_level, 5s, cause, scrub_clock_now);
+ m_scrub_job->delay_on_failure(sister_level, cause, scrub_clock_now);
m_osds->get_scrub_services().enqueue_target(trgt2);
trgt2.queued = true;
}
@@ -2333,7 +2332,8 @@ Scrub::schedule_result_t PgScrubber::start_scrub_session(
// i.e. some time before setting 'snaptrim'.
dout(10) << __func__ << ": cannot scrub while snap-trimming" << dendl;
requeue_penalized(
- s_or_d, delay_both_targets_t::yes, delay_cause_t::pg_state, clock_now);
+ s_or_d, delay_both_targets_t::yes, delay_cause_t::snap_trimming,
+ clock_now);
return schedule_result_t::target_specific_failure;
}
diff --git a/src/osd/scrubber/scrub_job.cc b/src/osd/scrubber/scrub_job.cc
index ee33ee06706..7b05eea3941 100644
--- a/src/osd/scrubber/scrub_job.cc
+++ b/src/osd/scrubber/scrub_job.cc
@@ -12,6 +12,7 @@ using sched_conf_t = Scrub::sched_conf_t;
using scrub_schedule_t = Scrub::scrub_schedule_t;
using ScrubJob = Scrub::ScrubJob;
using delay_ready_t = Scrub::delay_ready_t;
+using namespace std::chrono;
namespace {
utime_t add_double(utime_t t, double d)
@@ -300,16 +301,42 @@ void ScrubJob::adjust_deep_schedule(
SchedTarget& ScrubJob::delay_on_failure(
scrub_level_t level,
- std::chrono::seconds delay,
- Scrub::delay_cause_t delay_cause,
+ delay_cause_t delay_cause,
utime_t scrub_clock_now)
{
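+ // pick the retry delay from the config option matching the delay cause;
+ // causes without a dedicated option fall back to osd_scrub_retry_delay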
+ seconds delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_delay"));
+ switch (delay_cause) {
+ case delay_cause_t::flags:
+ delay =
+ seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_after_noscrub"));
+ break;
+ case delay_cause_t::pg_state:
+ delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_pg_state"));
+ break;
+ case delay_cause_t::snap_trimming:
+ delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_trimming"));
+ break;
+ case delay_cause_t::interval:
+ delay = seconds(cct->_conf.get_val<int64_t>("osd_scrub_retry_new_interval"));
+ break;
+ case delay_cause_t::local_resources:
+ case delay_cause_t::aborted:
+ default:
+ // for all other possible delay causes: use the default delay
+ break;
+ }
+
auto& delayed_target =
(level == scrub_level_t::deep) ? deep_target : shallow_target;
delayed_target.sched_info.schedule.not_before =
std::max(scrub_clock_now, delayed_target.sched_info.schedule.not_before) +
utime_t{delay};
delayed_target.sched_info.last_issue = delay_cause;
+ dout(20) << fmt::format(
+ "delayed {}scrub due to {} for {}s. Updated: {}",
+ (level == scrub_level_t::deep ? "deep " : ""), delay_cause,
+ delay.count(), delayed_target)
+ << dendl;
return delayed_target;
}
diff --git a/src/osd/scrubber/scrub_job.h b/src/osd/scrubber/scrub_job.h
index 98a3e101f9b..b037084db6b 100644
--- a/src/osd/scrubber/scrub_job.h
+++ b/src/osd/scrubber/scrub_job.h
@@ -239,14 +239,14 @@ class ScrubJob {
/**
* For the level specified, set the 'not-before' time to 'now+delay',
- * so that this scrub target
- * would not be retried before 'delay' seconds have passed.
+ * so that this scrub target would not be retried before the required
+ * delay seconds have passed.
+ * The delay is determined based on the 'cause' parameter.
* The 'last_issue' is updated to the cause of the delay.
* \returns a reference to the target that was modified.
*/
[[maybe_unused]] SchedTarget& delay_on_failure(
scrub_level_t level,
- std::chrono::seconds delay,
delay_cause_t delay_cause,
utime_t scrub_clock_now);
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
index 81e9bd7b6d8..da9466758f4 100644
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -199,6 +199,7 @@ sc::result Session::react(const IntervalChanged&)
ceph_assert(m_reservations);
m_reservations->discard_remote_reservations();
+ m_abort_reason = delay_cause_t::interval;
return transit<NotActive>();
}
@@ -300,7 +301,8 @@ ActiveScrubbing::~ActiveScrubbing()
// completed successfully), we use it now to set the 'failed scrub' duration.
if (session.m_session_started_at != ScrubTimePoint{}) {
// delay the next invocation of the scrubber on this target
- scrbr->on_mid_scrub_abort(Scrub::delay_cause_t::aborted);
+ scrbr->on_mid_scrub_abort(
+ session.m_abort_reason.value_or(Scrub::delay_cause_t::aborted));
auto logged_duration = ScrubClock::now() - session.m_session_started_at;
session.m_perf_set->tinc(scrbcnt_failed_elapsed, logged_duration);
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h
index d1edfd37c98..ad0d3bfba38 100644
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -551,6 +551,10 @@ struct Session : sc::state<Session, PrimaryActive, ReservingReplicas>,
/// the time when the session was initiated
ScrubTimePoint m_session_started_at{ScrubClock::now()};
+
+ /// abort reason - if known. Determines the delay time imposed on the
+ /// failed scrub target.
+ std::optional<Scrub::delay_cause_t> m_abort_reason{std::nullopt};
};
struct ReservingReplicas : sc::state<ReservingReplicas, Session>, NamedSimply {
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
index 60c1a68d2be..5e510a03a82 100644
--- a/src/osd/scrubber_common.h
+++ b/src/osd/scrubber_common.h
@@ -229,7 +229,8 @@ enum class delay_cause_t {
none, ///< scrub attempt was successful
replicas, ///< failed to reserve replicas
flags, ///< noscrub or nodeep-scrub
- pg_state, ///< e.g. snap-trimming
+ pg_state, ///< not active+clean
+ snap_trimming, ///< snap-trimming is in progress
restricted_time, ///< time restrictions or busy CPU
local_resources, ///< too many scrubbing PGs
aborted, ///< scrub was aborted w/ unspecified reason
@@ -252,6 +253,7 @@ struct formatter<Scrub::delay_cause_t> : ::fmt::formatter<std::string_view> {
case replicas: desc = "replicas"; break;
case flags: desc = "noscrub"; break;
case pg_state: desc = "pg-state"; break;
+ case snap_trimming: desc = "snap-trim"; break;
case restricted_time: desc = "time/load"; break;
case local_resources: desc = "local-cnt"; break;
case aborted: desc = "aborted"; break;
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 780e2b8784e..0bca599961e 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -144,10 +144,11 @@ DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1'
DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23'
DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29'
DEFAULT_JAEGER_AGENT_IMAGE = 'quay.io/jaegertracing/jaeger-agent:1.29'
-DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:1.26.1'
+DEFAULT_NGINX_IMAGE = 'quay.io/ceph/nginx:sclorg-nginx-126'
DEFAULT_OAUTH2_PROXY = 'quay.io/oauth2-proxy/oauth2-proxy:v7.6.0'
DEFAULT_JAEGER_QUERY_IMAGE = 'quay.io/jaegertracing/jaeger-query:1.29'
DEFAULT_SAMBA_IMAGE = 'quay.io/samba.org/samba-server:devbuilds-centos-amd64'
+DEFAULT_SAMBA_METRICS_IMAGE = 'quay.io/samba.org/samba-metrics:latest'
# ------------------------------------------------------------------------------
@@ -320,6 +321,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
desc='Samba/SMB container image',
),
Option(
+ 'container_image_samba_metrics',
+ default=DEFAULT_SAMBA_METRICS_IMAGE,
+ desc='Samba/SMB metrics exporter container image',
+ ),
+ Option(
'warn_on_stray_hosts',
type='bool',
default=True,
@@ -529,6 +535,19 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
'the host (in seconds)'
),
Option(
+ 'ssh_keepalive_interval',
+ type='int',
+ default=7,
+ desc='How often ssh connections are checked for liveness (in seconds)'
+ ),
+ Option(
+ 'ssh_keepalive_count_max',
+ type='int',
+ default=3,
+ desc='How many times ssh connections can fail liveness checks '
+ 'before the host is marked offline'
+ ),
+ Option(
'cephadm_log_destination',
type='str',
default='',
@@ -585,6 +604,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.container_image_jaeger_collector = ''
self.container_image_jaeger_query = ''
self.container_image_samba = ''
+ self.container_image_samba_metrics = ''
self.warn_on_stray_hosts = True
self.warn_on_stray_daemons = True
self.warn_on_failed_host_check = True
@@ -630,6 +650,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.default_cephadm_command_timeout = 0
self.cephadm_log_destination = ''
self.oob_default_addr = ''
+ self.ssh_keepalive_interval = 0
+ self.ssh_keepalive_count_max = 0
self.notify(NotifyType.mon_map, None)
self.config_notify()
diff --git a/src/pybind/mgr/cephadm/service_discovery.py b/src/pybind/mgr/cephadm/service_discovery.py
index e9b072a9a50..1efd2dc1281 100644
--- a/src/pybind/mgr/cephadm/service_discovery.py
+++ b/src/pybind/mgr/cephadm/service_discovery.py
@@ -13,6 +13,7 @@ from mgr_module import ServiceInfoT
from mgr_util import build_url
from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple, Optional, IO
from cephadm.services.nfs import NFSService
+from cephadm.services.smb import SMBService
from cephadm.services.monitoring import AlertmanagerService, NodeExporterService, PrometheusService
import secrets
from mgr_util import verify_tls_files
@@ -149,6 +150,7 @@ class Root(Server):
<p><a href='prometheus/sd-config?service=ceph-exporter'>Ceph exporter http sd-config</a></p>
<p><a href='prometheus/sd-config?service=nvmeof'>NVMeoF http sd-config</a></p>
<p><a href='prometheus/sd-config?service=nfs'>NFS http sd-config</a></p>
+<p><a href='prometheus/sd-config?service=smb'>SMB http sd-config</a></p>
<p><a href='prometheus/rules'>Prometheus rules</a></p>
</body>
</html>'''
@@ -171,6 +173,8 @@ class Root(Server):
return self.nvmeof_sd_config()
elif service == 'nfs':
return self.nfs_sd_config()
+ elif service == 'smb':
+ return self.smb_sd_config()
else:
return []
@@ -265,6 +269,19 @@ class Root(Server):
})
return srv_entries
+ def smb_sd_config(self) -> List[Dict[str, Collection[str]]]:
+ """Return <http_sd_config> compatible prometheus config for smb service."""
+ srv_entries = []
+ for dd in self.mgr.cache.get_daemons_by_type('smb'):
+ assert dd.hostname is not None
+ addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
+ port = SMBService.DEFAULT_EXPORTER_PORT
+ srv_entries.append({
+ 'targets': [build_url(host=addr, port=port).lstrip('/')],
+ 'labels': {'instance': dd.hostname}
+ })
+ return srv_entries
+
@cherrypy.expose(alias='prometheus/rules')
def get_prometheus_rules(self) -> str:
"""Return currently configured prometheus rules as Yaml."""
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py
index f407985fcee..5d5d26ecad1 100644
--- a/src/pybind/mgr/cephadm/services/monitoring.py
+++ b/src/pybind/mgr/cephadm/services/monitoring.py
@@ -503,6 +503,7 @@ class PrometheusService(CephadmService):
nvmeof_sd_url = f'{srv_end_point}service=nvmeof' # always included
mgmt_gw_enabled = len(self.mgr.cache.get_daemons_by_service('mgmt-gateway')) > 0
nfs_sd_url = f'{srv_end_point}service=nfs' # always included
+ smb_sd_url = f'{srv_end_point}service=smb' # always included
alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials()
prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials()
@@ -524,7 +525,8 @@ class PrometheusService(CephadmService):
'nvmeof_sd_url': nvmeof_sd_url,
'external_prometheus_targets': targets,
'cluster_fsid': FSID,
- 'nfs_sd_url': nfs_sd_url
+ 'nfs_sd_url': nfs_sd_url,
+ 'smb_sd_url': smb_sd_url
}
ip_to_bind_to = ''
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py
index b91427dcace..1792bbf37c9 100644
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -47,10 +47,17 @@ class NvmeofService(CephService):
transport_tcp_options = json.dumps(spec.transport_tcp_options) if spec.transport_tcp_options else None
name = '{}.{}'.format(utils.name_to_config_section('nvmeof'), nvmeof_gw_id)
rados_id = name[len('client.'):] if name.startswith('client.') else name
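+ # prefer addresses set in the spec; treat the 0.0.0.0 wildcard as unset and fall back to the host IP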
+ addr = host_ip
+ discovery_addr = host_ip
+ if spec.addr and spec.addr != "0.0.0.0":
+ addr = spec.addr
+ if spec.discovery_addr and spec.discovery_addr != "0.0.0.0":
+ discovery_addr = spec.discovery_addr
context = {
'spec': spec,
'name': name,
- 'addr': host_ip,
+ 'addr': addr,
+ 'discovery_addr': discovery_addr,
'port': spec.port,
'spdk_log_level': 'WARNING',
'rpc_socket_dir': '/var/tmp/',
@@ -144,7 +151,9 @@ class NvmeofService(CephService):
cmd_dicts.append({
'prefix': 'dashboard nvmeof-gateway-add',
'inbuf': service_url,
- 'name': service_name
+ 'name': service_name,
+ 'group': spec.group,
+ 'daemon_name': dd.name()
})
return cmd_dicts
diff --git a/src/pybind/mgr/cephadm/services/smb.py b/src/pybind/mgr/cephadm/services/smb.py
index da75136cdfb..dabc202a024 100644
--- a/src/pybind/mgr/cephadm/services/smb.py
+++ b/src/pybind/mgr/cephadm/services/smb.py
@@ -16,6 +16,7 @@ logger = logging.getLogger(__name__)
class SMBService(CephService):
TYPE = 'smb'
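+ # port the samba-metrics exporter listens on; also used for Prometheus service discovery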
+ DEFAULT_EXPORTER_PORT = 9922
smb_pool = '.smb' # minor layering violation. try to clean up later.
def config(self, spec: ServiceSpec) -> None:
@@ -79,6 +80,11 @@ class SMBService(CephService):
smb_spec, daemon_spec.daemon_id, ceph_users
)
)
+ config_blobs['metrics_image'] = (
+ self.mgr.container_image_samba_metrics
+ )
+ config_blobs['metrics_port'] = SMBService.DEFAULT_EXPORTER_PORT
+
logger.debug('smb generate_config: %r', config_blobs)
self._configure_cluster_meta(smb_spec, daemon_spec)
return config_blobs, []
diff --git a/src/pybind/mgr/cephadm/ssh.py b/src/pybind/mgr/cephadm/ssh.py
index f0d507dfe89..1622cb001ab 100644
--- a/src/pybind/mgr/cephadm/ssh.py
+++ b/src/pybind/mgr/cephadm/ssh.py
@@ -168,7 +168,9 @@ class SSHManager:
with self.redirect_log(host, addr):
try:
ssh_options = asyncssh.SSHClientConnectionOptions(
- keepalive_interval=7, keepalive_count_max=3)
+ keepalive_interval=self.mgr.ssh_keepalive_interval,
+ keepalive_count_max=self.mgr.ssh_keepalive_count_max
+ )
conn = await asyncssh.connect(addr, username=self.mgr.ssh_user, client_keys=[self.mgr.tkey.name],
known_hosts=None, config=[self.mgr.ssh_config_fname],
preferred_auth=['publickey'], options=ssh_options)
@@ -240,10 +242,6 @@ class SSHManager:
if log_command:
logger.debug(f'Running command: {rcmd}')
try:
- test_cmd = RemoteSudoCommand(
- Executables.TRUE, [], use_sudo=use_sudo
- )
- r = await conn.run(str(test_cmd), check=True, timeout=5) # host quick check
r = await conn.run(str(rcmd), input=stdin)
# handle these Exceptions otherwise you might get a weird error like
# TypeError: __init__() missing 1 required positional argument: 'reason' (due to the asyncssh error interacting with raise_if_exception)
diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
index 644ca586ba9..dbe29004771 100644
--- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
@@ -32,7 +32,7 @@ max_log_directory_backups = {{ spec.max_log_directory_backups }}
log_directory = {{ spec.log_directory }}
[discovery]
-addr = {{ addr }}
+addr = {{ discovery_addr }}
port = {{ spec.discovery_port }}
[ceph]
diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
index 3707f47f977..e7694867755 100644
--- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
+++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2
@@ -181,6 +181,26 @@ scrape_configs:
{% endif %}
{% endif %}
+{% if smb_sd_url %}
+ - job_name: 'smb'
+{% if security_enabled %}
+ honor_labels: true
+ scheme: https
+ tls_config:
+ ca_file: root_cert.pem
+ http_sd_configs:
+ - url: {{ smb_sd_url }}
+ basic_auth:
+ username: {{ service_discovery_username }}
+ password: {{ service_discovery_password }}
+ tls_config:
+ ca_file: root_cert.pem
+{% else %}
+ http_sd_configs:
+ - url: {{ smb_sd_url }}
+{% endif %}
+{% endif %}
+
{% if not security_enabled %}
- job_name: 'federate'
scrape_interval: 15s
diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py
index 159431b3b88..6f73cad91de 100644
--- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py
+++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py
@@ -27,6 +27,10 @@ class FakeCache:
return [FakeDaemonDescription('1.2.3.4', [9587], 'node0'),
FakeDaemonDescription('1.2.3.5', [9587], 'node1')]
+ if service_type == 'smb':
+ return [FakeDaemonDescription('1.2.3.4', [9922], 'node0'),
+ FakeDaemonDescription('1.2.3.5', [9922], 'node1')]
+
return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'),
FakeDaemonDescription('1.2.3.5', [9200], 'node1')]
@@ -206,6 +210,20 @@ class TestServiceDiscovery:
# check content
assert cfg[0]['targets'] == ['1.2.3.4:9587']
+ def test_get_sd_config_smb(self):
+ mgr = FakeMgr()
+ root = Root(mgr, 5000, '0.0.0.0')
+ cfg = root.get_sd_config('smb')
+
+ # check response structure
+ assert cfg
+ for entry in cfg:
+ assert 'labels' in entry
+ assert 'targets' in entry
+
+ # check content
+ assert cfg[0]['targets'] == ['1.2.3.4:9922']
+
def test_get_sd_config_invalid_service(self):
mgr = FakeMgr()
root = Root(mgr, 5000, '0.0.0.0')
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index d21c4c719a5..08802cd5923 100644
--- a/src/pybind/mgr/cephadm/tests/test_services.py
+++ b/src/pybind/mgr/cephadm/tests/test_services.py
@@ -829,6 +829,10 @@ class TestMonitoring:
http_sd_configs:
- url: http://[::1]:8765/sd/prometheus/sd-config?service=nfs
+ - job_name: 'smb'
+ http_sd_configs:
+ - url: http://[::1]:8765/sd/prometheus/sd-config?service=smb
+
- job_name: 'federate'
scrape_interval: 15s
honor_labels: true
@@ -1039,6 +1043,19 @@ class TestMonitoring:
tls_config:
ca_file: root_cert.pem
+ - job_name: 'smb'
+ honor_labels: true
+ scheme: https
+ tls_config:
+ ca_file: root_cert.pem
+ http_sd_configs:
+ - url: https://[::1]:8765/sd/prometheus/sd-config?service=smb
+ basic_auth:
+ username: sd_user
+ password: sd_password
+ tls_config:
+ ca_file: root_cert.pem
+
""").lstrip()
_run_cephadm.assert_called_with(
@@ -3176,6 +3193,8 @@ class TestSMB:
'config': '',
'keyring': '[client.smb.config.tango.briskly]\nkey = None\n',
'config_auth_entity': 'client.smb.config.tango.briskly',
+ 'metrics_image': 'quay.io/samba.org/samba-metrics:latest',
+ 'metrics_port': 9922,
},
}
with with_host(cephadm_module, 'hostx'):
@@ -3246,6 +3265,8 @@ class TestSMB:
'[client.smb.fs.fs2.share3]\nkey = None\n'
),
'config_auth_entity': 'client.smb.config.tango.briskly',
+ 'metrics_image': 'quay.io/samba.org/samba-metrics:latest',
+ 'metrics_port': 9922,
},
}
with with_host(cephadm_module, 'hostx'):
diff --git a/src/pybind/mgr/dashboard/controllers/host.py b/src/pybind/mgr/dashboard/controllers/host.py
index c0062b94d70..645fc4cc44c 100644
--- a/src/pybind/mgr/dashboard/controllers/host.py
+++ b/src/pybind/mgr/dashboard/controllers/host.py
@@ -512,3 +512,16 @@ class HostUi(BaseController):
@handle_orchestrator_error('host')
def inventory(self, refresh=None):
return get_inventories(None, refresh)
+
+ @Endpoint('GET')
+ @ReadPermission
+ @raise_if_no_orchestrator([OrchFeature.HOST_LIST])
+ @handle_orchestrator_error('host')
+ def list(self):
+ """
+ Get all hosts.
+ This endpoint returns all available hosts for cases where per-service
+ instance details are not needed (e.g. host selection in forms). It uses
+ get_hosts, which caches the response and keeps it performant.
+ """
+ return get_hosts()
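The new UI route can be exercised directly once authenticated against the dashboard. A hedged usage sketch, assuming the controller is mounted under /ui-api/host (matching the frontend's baseUIURL) and that a JWT token is at hand; the URL and token below are placeholders:

    import requests

    BASE = 'https://dashboard.example.local:8443'   # hypothetical dashboard URL
    TOKEN = '<jwt-token>'                           # placeholder

    resp = requests.get(f'{BASE}/ui-api/host/list',
                        headers={'Authorization': f'Bearer {TOKEN}'},
                        verify=False)
    for host in resp.json():
        print(host.get('hostname'), host.get('addr'))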
diff --git a/src/pybind/mgr/dashboard/controllers/nvmeof.py b/src/pybind/mgr/dashboard/controllers/nvmeof.py
index e050ecdf6a6..ec9c9897081 100644
--- a/src/pybind/mgr/dashboard/controllers/nvmeof.py
+++ b/src/pybind/mgr/dashboard/controllers/nvmeof.py
@@ -2,6 +2,8 @@
import logging
from typing import Any, Dict, Optional
+from orchestrator import OrchestratorError
+
from .. import mgr
from ..model import nvmeof as model
from ..security import Scope
@@ -30,19 +32,32 @@ else:
@EndpointDoc("Get information about the NVMeoF gateway")
@map_model(model.GatewayInfo)
@handle_nvmeof_error
- def list(self):
- return NVMeoFClient().stub.get_gateway_info(
+ def list(self, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.get_gateway_info(
NVMeoFClient.pb2.get_gateway_info_req()
)
+ @ReadPermission
+ @Endpoint('GET')
+ def group(self):
+ try:
+ orch = OrchClient.instance()
+ return orch.services.list(service_type='nvmeof')
+ except OrchestratorError as e:
+ # just return None instead of raising an exception
+ # since we need this to work regardless of the
+ # orchestrator status in the UI
+ logger.error('Failed to fetch the gateway groups: %s', e)
+ return None
+
@APIRouter("/nvmeof/subsystem", Scope.NVME_OF)
@APIDoc("NVMe-oF Subsystem Management API", "NVMe-oF Subsystem")
class NVMeoFSubsystem(RESTController):
@EndpointDoc("List all NVMeoF subsystems")
@map_collection(model.Subsystem, pick="subsystems")
@handle_nvmeof_error
- def list(self):
- return NVMeoFClient().stub.list_subsystems(
+ def list(self, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_subsystems(
NVMeoFClient.pb2.list_subsystems_req()
)
@@ -52,8 +67,8 @@ else:
)
@map_model(model.Subsystem, first="subsystems")
@handle_nvmeof_error
- def get(self, nqn: str):
- return NVMeoFClient().stub.list_subsystems(
+ def get(self, nqn: str, gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.list_subsystems(
NVMeoFClient.pb2.list_subsystems_req(subsystem_nqn=nqn)
)
@@ -61,14 +76,15 @@ else:
"Create a new NVMeoF subsystem",
parameters={
"nqn": Param(str, "NVMeoF subsystem NQN"),
- "max_namespaces": Param(int, "Maximum number of namespaces", True, 256),
+ "max_namespaces": Param(int, "Maximum number of namespaces", True, 1024),
"enable_ha": Param(bool, "Enable high availability"),
},
)
@empty_response
@handle_nvmeof_error
- def create(self, nqn: str, enable_ha: bool, max_namespaces: int = 256):
- return NVMeoFClient().stub.create_subsystem(
+ def create(self, nqn: str, enable_ha: bool, max_namespaces: int = 1024,
+ gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.create_subsystem(
NVMeoFClient.pb2.create_subsystem_req(
subsystem_nqn=nqn, max_namespaces=max_namespaces, enable_ha=enable_ha
)
@@ -83,8 +99,8 @@ else:
)
@empty_response
@handle_nvmeof_error
- def delete(self, nqn: str, force: Optional[str] = "false"):
- return NVMeoFClient().stub.delete_subsystem(
+ def delete(self, nqn: str, force: Optional[str] = "false", gw_group: Optional[str] = None):
+ return NVMeoFClient(gw_group=gw_group).stub.delete_subsystem(
NVMeoFClient.pb2.delete_subsystem_req(
subsystem_nqn=nqn, force=str_to_bool(force)
)
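Each of these endpoints now accepts an optional gw_group query parameter that selects which gateway group to talk to; omitting it keeps the previous single-gateway behaviour. A hedged REST usage sketch, with URL, token and group name as placeholders (only the parameter name comes from the change above):

    import requests

    BASE = 'https://dashboard.example.local:8443'   # placeholder
    resp = requests.get(f'{BASE}/api/nvmeof/subsystem',
                        params={'gw_group': 'group1'},           # hypothetical group
                        headers={'Authorization': 'Bearer <jwt-token>',
                                 'Accept': 'application/vnd.ceph.api.v1.0+json'},
                        verify=False)
    print(resp.json())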
diff --git a/src/pybind/mgr/dashboard/frontend/cypress/e2e/block/images.po.ts b/src/pybind/mgr/dashboard/frontend/cypress/e2e/block/images.po.ts
index 7bac7d12bed..cdf7d7cb531 100644
--- a/src/pybind/mgr/dashboard/frontend/cypress/e2e/block/images.po.ts
+++ b/src/pybind/mgr/dashboard/frontend/cypress/e2e/block/images.po.ts
@@ -45,7 +45,7 @@ export class ImagesPageHelper extends PageHelper {
// checks that it is present in the trash table
moveToTrash(name: string) {
// wait for image to be created
- cy.get('cds-table table tbody').first().should('not.contain.text', '(Creating...)');
+ cy.get('table[cdstable] tbody').first().should('not.contain.text', '(Creating...)');
this.getFirstTableCell(name).click();
diff --git a/src/pybind/mgr/dashboard/frontend/cypress/e2e/page-helper.po.ts b/src/pybind/mgr/dashboard/frontend/cypress/e2e/page-helper.po.ts
index 4d5b0aa7bc7..2e94179aa74 100644
--- a/src/pybind/mgr/dashboard/frontend/cypress/e2e/page-helper.po.ts
+++ b/src/pybind/mgr/dashboard/frontend/cypress/e2e/page-helper.po.ts
@@ -147,14 +147,14 @@ export abstract class PageHelper {
*/
private waitDataTableToLoad() {
cy.get('cd-table').should('exist');
- cy.get('cds-table table tbody').should('exist');
+ cy.get('table[cdstable] tbody').should('exist');
cy.contains('Loading').should('not.exist');
}
getDataTables() {
this.waitDataTableToLoad();
- return cy.get('cd-table cds-table');
+ return cy.get('cd-table [cdsTable]');
}
private getTableCountSpan(_spanType: 'selected' | 'found' | 'total' | 'item' | 'items') {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.html
index 18a4000e14d..c466c8674cc 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-gateway/nvmeof-gateway.component.html
@@ -3,8 +3,7 @@
<legend i18n>
Gateways
<cd-help-text>
- The NVMe-oF gateway integrates Ceph with the NVMe over TCP (NVMe/TCP) protocol to provide an NVMe/TCP target that exports RADOS Block Device (RBD) images.
- </cd-help-text>
+ Ceph NVMe-oF gateways provide Ceph Block Device storage through NVMe/TCP. For VMware clients the NVMe/TCP volumes display as VMFS Datastores. For Linux clients the NVMe/TCP volumes display as block devices.</cd-help-text>
</legend>
<div>
<cd-table [data]="gateways"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.html
index 29ebbe645d1..bdbe7320cfa 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-initiators-list/nvmeof-initiators-list.component.html
@@ -1,6 +1,7 @@
<legend>
<cd-help-text>
- The client that connects to the NVMe-oF target to access NVMe storage.
+ An initiator (or host) is the client that connects to the NVMe-oF target to access NVMe storage.
+ The NVMe/TCP protocol allows initiators to send NVMe-oF commands to storage devices, which are known as targets.
</cd-help-text>
</legend>
<cd-table [data]="initiators"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts
index bc02ea99c98..412286bda20 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-form/nvmeof-listeners-form.component.ts
@@ -13,7 +13,10 @@ import { FormatterService } from '~/app/shared/services/formatter.service';
import { CdValidators } from '~/app/shared/forms/cd-validators';
import { DimlessBinaryPipe } from '~/app/shared/pipes/dimless-binary.pipe';
import { HostService } from '~/app/shared/api/host.service';
-import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context';
+import { DaemonService } from '~/app/shared/api/daemon.service';
+import { map } from 'rxjs/operators';
+import { forkJoin } from 'rxjs';
+
@Component({
selector: 'cd-nvmeof-listeners-form',
templateUrl: './nvmeof-listeners-form.component.html',
@@ -39,7 +42,8 @@ export class NvmeofListenersFormComponent implements OnInit {
private route: ActivatedRoute,
public activeModal: NgbActiveModal,
public formatterService: FormatterService,
- public dimlessBinaryPipe: DimlessBinaryPipe
+ public dimlessBinaryPipe: DimlessBinaryPipe,
+ private daemonService: DaemonService
) {
this.permission = this.authStorageService.getPermissions().nvmeof;
this.hostPermission = this.authStorageService.getPermissions().hosts;
@@ -48,13 +52,19 @@ export class NvmeofListenersFormComponent implements OnInit {
}
setHosts() {
- const hostContext = new CdTableFetchDataContext(() => undefined);
- this.hostService.list(hostContext.toParams(), 'false').subscribe((resp: any[]) => {
- const nvmeofHosts = resp.filter((r) =>
- r.service_instances.some((si: any) => si.type === 'nvmeof')
- );
- this.hosts = nvmeofHosts.map((h) => ({ hostname: h.hostname, addr: h.addr }));
- });
+ forkJoin({
+ daemons: this.daemonService.list(['nvmeof']),
+ hosts: this.hostService.getAllHosts()
+ })
+ .pipe(
+ map(({ daemons, hosts }) => {
+ const hostNamesFromDaemon = daemons.map((daemon: any) => daemon.hostname);
+ return hosts.filter((host: any) => hostNamesFromDaemon.includes(host.hostname));
+ })
+ )
+ .subscribe((nvmeofHosts: any[]) => {
+ this.hosts = nvmeofHosts.map((h) => ({ hostname: h.hostname, addr: h.addr }));
+ });
}
ngOnInit() {
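The setHosts() change above replaces the generic host listing with a join of nvmeof daemons and all hosts, keeping only hosts that actually run an nvmeof daemon. The same filtering, written as a plain Python sketch for clarity (field names assumed to mirror the objects used above):

    def nvmeof_hosts(daemons, hosts):
        # keep only hosts that have at least one nvmeof daemon placed on them
        daemon_hostnames = {d['hostname'] for d in daemons}
        return [h for h in hosts if h['hostname'] in daemon_hostnames]

    print(nvmeof_hosts([{'hostname': 'node0'}],
                       [{'hostname': 'node0', 'addr': '1.2.3.4'},
                        {'hostname': 'node1', 'addr': '1.2.3.5'}]))
    # -> [{'hostname': 'node0', 'addr': '1.2.3.4'}]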
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.html
index da001554120..1a24ddfd28c 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-listeners-list/nvmeof-listeners-list.component.html
@@ -1,6 +1,6 @@
<legend>
<cd-help-text>
- A listener defines the IP port on the gateway that is to process NVMe/TCP commands and I/O operations.
+ A listener defines the IP address and port on the gateway that is used to process NVMe/TCP admin and I/O commands to a subsystem.
</cd-help-text>
</legend>
<cd-table [data]="listeners"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.html
index 72576b7e642..87e474de757 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-namespaces-form/nvmeof-namespaces-form.component.html
@@ -39,7 +39,7 @@
[value]="pool.pool_name">{{ pool.pool_name }}</option>
</select>
<cd-help-text i18n>
- A RBD application-enabled pool where the image will be created.
+ An RBD application-enabled pool where the image will be created.
</cd-help-text>
<span class="invalid-feedback"
*ngIf="nsForm.showError('pool', formDir, 'required')"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.html
index 1032a0d1e26..5ccc48eeeae 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.html
@@ -49,16 +49,16 @@
type="text"
name="max_namespaces"
formControlName="max_namespaces">
- <cd-help-text i18n>The maximum namespaces per subsystem. Default is 256.</cd-help-text>
+ <cd-help-text i18n>The maximum namespaces per subsystem. Default is {{defaultMaxNamespace}}.</cd-help-text>
<span class="invalid-feedback"
*ngIf="subsystemForm.showError('max_namespaces', formDir, 'min')"
i18n>The value must be at least 1.</span>
<span class="invalid-feedback"
*ngIf="subsystemForm.showError('max_namespaces', formDir, 'max')"
- i18n>The value cannot be greated than 256.</span>
+ i18n>The value cannot be greater than {{defaultMaxNamespace}}.</span>
<span class="invalid-feedback"
*ngIf="subsystemForm.showError('max_namespaces', formDir, 'pattern')"
- i18n>The value must be a positive integer.</span>
+ i18n>The value must be a positive integer.</span>
</div>
</div>
</div>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.spec.ts
index 4e9c420c9f0..08e56debf0a 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.spec.ts
@@ -11,7 +11,7 @@ import { CdFormGroup } from '~/app/shared/forms/cd-form-group';
import { SharedModule } from '~/app/shared/shared.module';
import { NvmeofSubsystemsFormComponent } from './nvmeof-subsystems-form.component';
import { FormHelper } from '~/testing/unit-test-helper';
-import { NvmeofService } from '~/app/shared/api/nvmeof.service';
+import { MAX_NAMESPACE, NvmeofService } from '~/app/shared/api/nvmeof.service';
describe('NvmeofSubsystemsFormComponent', () => {
let component: NvmeofSubsystemsFormComponent;
@@ -59,7 +59,7 @@ describe('NvmeofSubsystemsFormComponent', () => {
component.onSubmit();
expect(nvmeofService.createSubsystem).toHaveBeenCalledWith({
nqn: expectedNqn,
- max_namespaces: 256,
+ max_namespaces: MAX_NAMESPACE,
enable_ha: true
});
});
@@ -76,8 +76,8 @@ describe('NvmeofSubsystemsFormComponent', () => {
formHelper.expectError('max_namespaces', 'pattern');
});
- it('should give error on max_namespaces greater than 256', () => {
- formHelper.setValue('max_namespaces', 300);
+ it(`should give error on max_namespaces greater than ${MAX_NAMESPACE}`, () => {
+ formHelper.setValue('max_namespaces', 2000);
component.onSubmit();
formHelper.expectError('max_namespaces', 'max');
});
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts
index 5debb52c4d3..5c2e1ce5250 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems-form/nvmeof-subsystems-form.component.ts
@@ -10,7 +10,7 @@ import { AuthStorageService } from '~/app/shared/services/auth-storage.service';
import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
import { FinishedTask } from '~/app/shared/models/finished-task';
import { Router } from '@angular/router';
-import { NvmeofService } from '~/app/shared/api/nvmeof.service';
+import { MAX_NAMESPACE, NvmeofService } from '~/app/shared/api/nvmeof.service';
@Component({
selector: 'cd-nvmeof-subsystems-form',
@@ -23,6 +23,7 @@ export class NvmeofSubsystemsFormComponent implements OnInit {
action: string;
resource: string;
pageURL: string;
+ defaultMaxNamespace: number = MAX_NAMESPACE;
constructor(
private authStorageService: AuthStorageService,
@@ -68,8 +69,12 @@ export class NvmeofSubsystemsFormComponent implements OnInit {
CdValidators.unique(this.nvmeofService.isSubsystemPresent, this.nvmeofService)
]
}),
- max_namespaces: new UntypedFormControl(256, {
- validators: [CdValidators.number(false), Validators.max(256), Validators.min(1)]
+ max_namespaces: new UntypedFormControl(this.defaultMaxNamespace, {
+ validators: [
+ CdValidators.number(false),
+ Validators.max(this.defaultMaxNamespace),
+ Validators.min(1)
+ ]
})
});
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.html
index cc89008eb82..6cd1f205913 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/nvmeof-subsystems/nvmeof-subsystems.component.html
@@ -2,7 +2,7 @@
<legend i18n>
Subsystems
<cd-help-text>
- A subsystem presents a collection of controllers which are used to access namespaces.
+ A subsystem provides access control, determining which hosts can access the namespaces within it.
</cd-help-text>
</legend>
<cd-table [data]="subsystems"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-details/rbd-details.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-details/rbd-details.component.ts
index ee06198d168..5cb980bef10 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-details/rbd-details.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-details/rbd-details.component.ts
@@ -25,7 +25,7 @@ export class RbdDetailsComponent implements OnChanges {
ngOnChanges() {
if (this.selection) {
- this.rbdDashboardUrl = `rbd-details?var-Pool=${this.selection['pool_name']}&var-Image=${this.selection['name']}`;
+ this.rbdDashboardUrl = `rbd-details?var-pool=${this.selection['pool_name']}&var-image=${this.selection['name']}`;
}
}
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html
index 67192f5d338..85c56cbf0d4 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/block/rbd-form/rbd-form.component.html
@@ -176,8 +176,8 @@
i18n>
Use a dedicated data pool
- <cd-help-text>Use a dedicated pool to store the mirror data. If not selected,
- the mirror data will be stored in the same pool as the image data.
+ <cd-help-text>Use a dedicated pool to store the image data. If not selected,
+ the image data will be stored in the same pool as the image metadata.
</cd-help-text>
<cd-helper *ngIf="allDataPools.length <= 1 && mode !== 'editing'">
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.ts
index 3b99541418a..c0373a9fb77 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cephfs/cephfs-form/cephfs-form.component.ts
@@ -19,7 +19,6 @@ import { CdValidators } from '~/app/shared/forms/cd-validators';
import { FinishedTask } from '~/app/shared/models/finished-task';
import { Permission } from '~/app/shared/models/permissions';
import { TaskWrapperService } from '~/app/shared/services/task-wrapper.service';
-import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context';
@Component({
selector: 'cd-cephfs-form',
@@ -123,9 +122,8 @@ export class CephfsVolumeFormComponent extends CdForm implements OnInit {
}
});
} else {
- const hostContext = new CdTableFetchDataContext(() => undefined);
this.hostsAndLabels$ = forkJoin({
- hosts: this.hostService.list(hostContext.toParams(), 'false'),
+ hosts: this.hostService.getAllHosts(),
labels: this.hostService.getLabels()
}).pipe(
map(({ hosts, labels }) => ({
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/configuration/configuration.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/configuration/configuration.component.spec.ts
index 0156b9196e1..42f597957c9 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/configuration/configuration.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/configuration/configuration.component.spec.ts
@@ -39,7 +39,8 @@ describe('ConfigurationComponent', () => {
expect(component).toBeTruthy();
});
- it('should check header text', () => {
+ // TODO: Re-write this unit test to reflect latest changes on datatable markup
+ it.skip('should check header text', () => {
const cdTableEl = fixture.debugElement.query(By.directive(TableComponent));
const cdTableComponent: TableComponent = cdTableEl.componentInstance;
cdTableComponent.ngAfterViewInit();
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.spec.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.spec.ts
index 2c25c462220..c7e1c31fc3b 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.spec.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/hosts/hosts.component.spec.ts
@@ -117,7 +117,7 @@ describe('HostsComponent', () => {
fixture.detectChanges();
const spans = fixture.debugElement.nativeElement.querySelectorAll(
- 'cds-table > table > tbody > tr > td > span'
+ 'table > tbody > tr > td > span'
);
expect(spans[0].textContent.trim()).toBe(hostname);
});
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.html b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.html
index a71c406461b..586ca57209f 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.html
@@ -97,7 +97,7 @@
[value]="pool.pool_name">{{ pool.pool_name }}</option>
</select>
<cd-help-text i18n>
- A pool in which the gateway configuration can be managed.
+ An RBD application-enabled pool in which the gateway configuration can be managed.
</cd-help-text>
<span class="invalid-feedback"
*ngIf="serviceForm.showError('pool', frm, 'required')"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.ts
index 9602c856aed..bada177f735 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/cluster/services/service-form/service-form.component.ts
@@ -30,7 +30,6 @@ import { CdForm } from '~/app/shared/forms/cd-form';
import { CdFormBuilder } from '~/app/shared/forms/cd-form-builder';
import { CdFormGroup } from '~/app/shared/forms/cd-form-group';
import { CdValidators } from '~/app/shared/forms/cd-validators';
-import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context';
import { FinishedTask } from '~/app/shared/models/finished-task';
import { CephServiceSpec } from '~/app/shared/models/service.interface';
import { ModalService } from '~/app/shared/services/modal.service';
@@ -470,8 +469,7 @@ export class ServiceFormComponent extends CdForm implements OnInit {
this.serviceTypes = _.difference(resp, this.hiddenServices).sort();
});
- const hostContext = new CdTableFetchDataContext(() => undefined);
- this.hostService.list(hostContext.toParams(), 'false').subscribe((resp: object[]) => {
+ this.hostService.getAllHosts().subscribe((resp: object[]) => {
const options: SelectOption[] = [];
_.forEach(resp, (host: object) => {
if (_.get(host, 'sources.orchestrator', false)) {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-import/rgw-multisite-import.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-import/rgw-multisite-import.component.ts
index deda8901670..6a3edfbf59a 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-import/rgw-multisite-import.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/ceph/rgw/rgw-multisite-import/rgw-multisite-import.component.ts
@@ -11,7 +11,6 @@ import { RgwZone } from '../models/rgw-multisite';
import _ from 'lodash';
import { SelectMessages } from '~/app/shared/components/select/select-messages.model';
import { HostService } from '~/app/shared/api/host.service';
-import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context';
import { SelectOption } from '~/app/shared/components/select/select-option.model';
import { Observable, Subject, merge } from 'rxjs';
import { debounceTime, distinctUntilChanged, filter, map } from 'rxjs/operators';
@@ -62,8 +61,7 @@ export class RgwMultisiteImportComponent implements OnInit {
this.zoneNames = this.zoneList.map((zone) => {
return zone['name'];
});
- const hostContext = new CdTableFetchDataContext(() => undefined);
- this.hostService.list(hostContext.toParams(), 'false').subscribe((resp: object[]) => {
+ this.hostService.getAllHosts().subscribe((resp: object[]) => {
const options: SelectOption[] = [];
_.forEach(resp, (host: object) => {
if (_.get(host, 'sources.orchestrator', false)) {
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/host.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/host.service.ts
index 3bb56957583..ce23302ba26 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/host.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/host.service.ts
@@ -162,4 +162,8 @@ export class HostService extends ApiClient {
})
);
}
+
+ getAllHosts(): Observable<object[]> {
+ return this.http.get<object[]>(`${this.baseUIURL}/list`);
+ }
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts
index 4b4c4e86693..7c72530e84a 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/api/nvmeof.service.ts
@@ -5,6 +5,8 @@ import _ from 'lodash';
import { Observable, of as observableOf } from 'rxjs';
import { catchError, mapTo } from 'rxjs/operators';
+export const MAX_NAMESPACE = 1024;
+
export interface ListenerRequest {
host_name: string;
traddr: string;
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/components/alert-panel/alert-panel.component.html b/src/pybind/mgr/dashboard/frontend/src/app/shared/components/alert-panel/alert-panel.component.html
index 8e9b2237c3f..51b218769bb 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/components/alert-panel/alert-panel.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/components/alert-panel/alert-panel.component.html
@@ -1,4 +1,4 @@
-<cds-actionable-notification class="mb-1"
+<cds-actionable-notification class="mb-1 content-theme"
[ngClass]="spacingClass"
[notificationObj]="notificationContent"
(close)="onClose()"></cds-actionable-notification>
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.html b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.html
index 0829c908a3f..deb705d1fff 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.html
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.html
@@ -1,5 +1,6 @@
<cds-table-container [cdsLayer]="layer"
- [cdsTheme]="theme">
+ [cdsTheme]="theme"
+ class="content-theme">
<cds-table-toolbar #toolbar
*ngIf="toolHeader"
(cancel)="onBatchActionsCancel()"
@@ -131,49 +132,87 @@
</div>
</div>
<!-- end filter chips for column filters -->
- <cds-table [model]="model"
- [sortable]="!!userConfig.sorts"
- [size]="size"
- class="overflow-y-hidden"
- [skeleton]="false"
- [showSelectionColumn]="selectionType === 'multiClick'"
- [enableSingleSelect]="selectionType === 'single'"
- [stickyHeader]="false"
- [striped]="false"
- [isDataGrid]="false"
- (sort)="changeSorting($event)"
- (selectRow)="onSelect($event)"
- (selectAll)="onSelectAll($event)"
- (deselectRow)="onDeselect($event)"
- (deselectAll)="onDeselectAll($event)">
- <tbody>
- <tr cdstablerow
- *ngIf="!rows?.length && !loadingIndicator">
- <td class="no-data"
- cdstabledata
- [attr.colspan]="selectionType === 'single' ? visibleColumns.length + 1 : visibleColumns.length + 2">
- <span class="d-flex justify-content-center align-items-center"
- i18n>No data to display</span>
- </td>
- </tr>
- <tr cdstablerow
- *ngIf="loadingIndicator">
- <td class="no-data"
- cdstabledata
- [attr.colspan]="visibleColumns.length + 1">
- <span class="d-flex justify-content-center align-items-center"
- i18n>Loading</span>
- </td>
- </tr>
+ <table cdsTable
+ [sortable]="sortable"
+ [noBorder]="false"
+ [size]="size"
+ [striped]="false"
+ [skeleton]="loadingIndicator">
+ <thead cdsTableHead
+ [sortable]="sortable"
+ (deselectAll)="onDeselectAll()"
+ (selectAll)="onSelectAll()"
+ (sort)="changeSorting($event)"
+ [model]="model"
+ [showSelectionColumn]="showSelectionColumn"
+ [enableSingleSelect]="enableSingleSelect"
+ [skeleton]="loadingIndicator"
+ [stickyHeader]="false">
+ </thead>
+ <tbody cdsTableBody
+ *ngIf="!noData; else noDataTemplate"
+ [skeleton]="loadingIndicator">
+ <ng-container *ngFor="let row of model.data; let i = index; trackBy: trackByFn.bind(this, identifier)">
+ <tr cdsTableRow
+ [model]="model"
+ [row]="row"
+ [size]="size"
+ [selected]="model.isRowSelected(i)"
+ [expandable]="model.isRowExpandable(i)"
+ [expanded]="model.isRowExpanded(i)"
+ [showSelectionColumn]="showSelectionColumn"
+ [enableSingleSelect]="enableSingleSelect"
+ [skeleton]="loadingIndicator"
+ (selectRow)="onSelect(i)"
+ (deselectRow)="onDeselect(i)"
+ (expandRow)="model.expandRow(i, !model.isRowExpanded(i))"
+ (rowClick)="onSelect(i)"
+ *ngIf="!model.isRowFiltered(i)">
+ </tr>
+ <tr cdsTableExpandedRow
+ cdsExpandedRowHover
+ *ngIf="model.isRowExpandable(i) && !shouldExpandAsTable(row) && !model.isRowFiltered(i)"
+ [row]="row"
+ [expanded]="model.isRowExpanded(i)"
+ [skeleton]="loadingIndicator">
+ </tr>
+ <ng-container *ngIf="model.isRowExpandable(i) && shouldExpandAsTable(row) && model.isRowExpanded(i) && !model.isRowFiltered(i)">
+ <tr cdsTableRow
+ *ngFor="let expandedDataRow of firstExpandedDataInRow(row)"
+ [model]="model"
+ [showSelectionColumnCheckbox]="false"
+ [showSelectionColumn]="showSelectionColumn"
+ [row]="expandedDataRow"
+ [size]="size"
+ [selected]="model.isRowSelected(i)"
+ [skeleton]="loadingIndicator">
+ </tr>
+ </ng-container>
+ </ng-container>
</tbody>
- </cds-table>
+ </table>
<cds-pagination [model]="model"
(selectPage)="onPageChange($event)"
[disabled]="limit === 0"
+ [skeleton]="loadingIndicator"
[pageInputDisabled]="limit === 0">
</cds-pagination>
</cds-table-container>
+<ng-template #noDataTemplate>
+ <tbody>
+ <tr cdstablerow>
+ <td *ngIf="!rows?.length && !loadingIndicator"
+ class="no-data"
+ cdstabledata
+ [attr.colspan]="visibleColumns.length + 2">
+ <span class="d-flex justify-content-center align-items-center"
+ i18n>No data to display</span>
+ </td>
+ </tr>
+ </tbody>
+</ng-template>
+
<ng-template #rowDetailTpl
let-row="data">
<div *ngIf="row[identifier] === expanded?.[identifier]"
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts
index 37277df4033..97bcee3dfe3 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/datatable/table/table.component.ts
@@ -27,13 +27,12 @@ import { CdTableColumn } from '~/app/shared/models/cd-table-column';
import { CdTableColumnFilter } from '~/app/shared/models/cd-table-column-filter';
import { CdTableColumnFiltersChange } from '~/app/shared/models/cd-table-column-filters-change';
import { CdTableFetchDataContext } from '~/app/shared/models/cd-table-fetch-data-context';
-import { PageInfo } from '~/app/shared/models/cd-table-paging';
import { CdTableSelection } from '~/app/shared/models/cd-table-selection';
import { CdUserConfig } from '~/app/shared/models/cd-user-config';
import { TimerService } from '~/app/shared/services/timer.service';
import { TableActionsComponent } from '../table-actions/table-actions.component';
import { TableDetailDirective } from '../directives/table-detail.directive';
-import { filter, map, throttleTime } from 'rxjs/operators';
+import { filter, map } from 'rxjs/operators';
import { CdSortDirection } from '../../enum/cd-sort-direction';
import { CdSortPropDir } from '../../models/cd-sort-prop-dir';
@@ -254,6 +253,32 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
private _expanded: any = undefined;
+ get sortable() {
+ return !!this.userConfig?.sorts;
+ }
+
+ get noData() {
+ return !this.rows?.length && !this.loadingIndicator;
+ }
+
+ get showSelectionColumn() {
+ return this.selectionType === 'multiClick';
+ }
+
+ get enableSingleSelect() {
+ return this.selectionType === 'single';
+ }
+
+ /**
+ * Controls if all checkboxes are viewed as selected.
+ */
+ selectAllCheckbox = false;
+
+ /**
+ * Controls the indeterminate state of the header checkbox.
+ */
+ selectAllCheckboxSomeSelected = false;
+
/**
* To prevent making changes to the original columns list, that might change
* how the table is rendered a second time, we now clone that list into a
@@ -295,7 +320,7 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
size: this.model.pageLength,
filteredData: value
});
- this.model.totalDataLength = value?.length || 0;
+ this.model.totalDataLength = this.serverSide ? this.count : value?.length || 0;
}
get rows() {
@@ -343,13 +368,6 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
return search.split(' ').filter((word) => word);
}
- shouldThrottle(): number {
- if (this.autoReload === -1) {
- return 500;
- }
- return 0;
- }
-
ngAfterViewInit(): void {
if (this.tableActions?.dropDownActions?.length) {
this.tableColumns = [
@@ -394,10 +412,6 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
return false;
}
return true;
- }),
- throttleTime(this.shouldThrottle(), undefined, {
- leading: true,
- trailing: false
})
)
.subscribe({
@@ -409,7 +423,7 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
let tableItem = new TableItem({
selected: val,
data: {
- value: column.pipe ? column.pipe.transform(rowValue || val) : rowValue,
+ value: column.pipe ? column.pipe.transform(rowValue) : rowValue,
row: val,
column: { ...column, ...val }
}
@@ -419,7 +433,8 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
tableItem.data = { ...tableItem.data, row: val };
if (this.hasDetails) {
- (tableItem.expandedData = val), (tableItem.expandedTemplate = this.rowDetailTpl);
+ tableItem.expandedData = val;
+ tableItem.expandedTemplate = this.rowDetailTpl;
}
}
@@ -455,9 +470,18 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
}
});
+ const rowsChangeSubscription = this.model.rowsSelectedChange.subscribe(() =>
+ this.updateSelectAllCheckbox()
+ );
+ const dataChangeSubscription = this.model.dataChange.subscribe(() => {
+ this.updateSelectAllCheckbox();
+ });
+
this._subscriptions.add(tableHeadersSubscription);
this._subscriptions.add(datasetSubscription);
this._subscriptions.add(rowsExpandedSubscription);
+ this._subscriptions.add(rowsChangeSubscription);
+ this._subscriptions.add(dataChangeSubscription);
}
ngOnInit() {
@@ -546,7 +570,7 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
this.userConfig.limit = this.limit;
}
if (!(this.userConfig.offset >= 0)) {
- // this.userConfig.offset = this.model.currentPage;
+ this.userConfig.offset = this.model.currentPage - 1;
}
if (!this.userConfig.search) {
this.userConfig.search = this.search;
@@ -771,11 +795,7 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
ngOnChanges(changes: SimpleChanges) {
if (changes?.data?.currentValue) {
- if (_.isNil(this.expanded)) {
- this.useData();
- } else if (this.model.rowsExpanded.every((x) => !x)) {
- this.expanded = undefined;
- }
+ this.useData();
}
}
@@ -828,16 +848,17 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
this.reloadData();
}
- changePage(pageInfo: PageInfo) {
- this.userConfig.offset = pageInfo.offset;
- this.userConfig.limit = pageInfo.limit;
+ onPageChange(page: number) {
+ this.model.currentPage = page;
+
+ this.userConfig.offset = this.model.currentPage - 1;
+ this.userConfig.limit = this.model.pageLength;
+
if (this.serverSide) {
this.reloadData();
+ return;
}
- }
- onPageChange(page: number) {
- this.model.currentPage = page;
this.doPagination({});
}
@@ -846,6 +867,11 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
size = this.model.pageLength,
filteredData = this.rows
}): void {
+ if (this.serverSide) {
+ this._dataset.next(filteredData);
+ return;
+ }
+
if (this.limit === 0) {
this.model.currentPage = 1;
this.model.pageLength = filteredData.length;
@@ -893,10 +919,10 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
this.updateColumnFilterOptions();
this.updateFilter();
this.reset();
+ this.doSorting();
this.updateSelected();
this.updateExpanded();
this.toggleExpandRow();
- this.doSorting();
}
/**
@@ -978,9 +1004,9 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
}
}
- onSelect($event: any) {
- const { selectedRowIndex } = $event;
+ onSelect(selectedRowIndex: number) {
const selectedData = _.get(this.model.data?.[selectedRowIndex], [0, 'selected']);
+ this.model.selectRow(selectedRowIndex, true);
if (this.selectionType === 'single') {
this.selection.selected = [selectedData];
} else {
@@ -989,24 +1015,27 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
this.updateSelection.emit(this.selection);
}
- onSelectAll($event: TableModel) {
- $event.rowsSelected.forEach((isSelected: boolean, rowIndex: number) =>
+ onSelectAll() {
+ this.model.selectAll(!this.selectAllCheckbox && !this.selectAllCheckboxSomeSelected);
+ this.model.rowsSelected.forEach((isSelected: boolean, rowIndex: number) =>
this._toggleSelection(rowIndex, isSelected)
);
this.updateSelection.emit(this.selection);
+ this.cdRef.detectChanges();
}
- onDeselect($event: any) {
+ onDeselect(deselectedRowIndex: number) {
+ this.model.selectRow(deselectedRowIndex, false);
if (this.selectionType === 'single') {
return;
}
- const { deselectedRowIndex } = $event;
this._toggleSelection(deselectedRowIndex, false);
this.updateSelection.emit(this.selection);
}
- onDeselectAll($event: TableModel) {
- $event.rowsSelected.forEach((isSelected: boolean, rowIndex: number) =>
+ onDeselectAll() {
+ this.model.selectAll(false);
+ this.model.rowsSelected.forEach((isSelected: boolean, rowIndex: number) =>
this._toggleSelection(rowIndex, isSelected)
);
this.updateSelection.emit(this.selection);
@@ -1243,4 +1272,41 @@ export class TableComponent implements AfterViewInit, OnInit, OnChanges, OnDestr
(_, rowIndex: number) => rowIndex === expandedRowIndex
);
}
+
+ firstExpandedDataInRow(row: TableItem[]) {
+ const found = row.find((d) => d.expandedData);
+ if (found) {
+ return found.expandedData;
+ }
+ return found;
+ }
+
+ shouldExpandAsTable(row: TableItem[]) {
+ return row.some((d) => d.expandAsTable);
+ }
+
+ isRowExpandable(index: number) {
+ return this.model.data[index].some((d) => d && d.expandedData);
+ }
+
+ trackByFn(id: string, _index: number, row: TableItem[]) {
+ const uniqueIdentifier = _.get(row, [0, 'data', 'row', id])?.toString?.();
+ return uniqueIdentifier || row;
+ }
+
+ updateSelectAllCheckbox() {
+ const selectedRowsCount = this.model.selectedRowsCount();
+
+ if (selectedRowsCount <= 0) {
+ // reset select all checkbox if nothing selected
+ this.selectAllCheckbox = false;
+ this.selectAllCheckboxSomeSelected = false;
+ } else if (selectedRowsCount < this.model.data.length) {
+ this.selectAllCheckbox = true;
+ this.selectAllCheckboxSomeSelected = true;
+ } else {
+ this.selectAllCheckbox = true;
+ this.selectAllCheckboxSomeSelected = false;
+ }
+ }
}
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/cd-date.pipe.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/cd-date.pipe.ts
index 887d8d6bfb9..b67a792efcc 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/cd-date.pipe.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/pipes/cd-date.pipe.ts
@@ -22,7 +22,7 @@ export class CdDatePipe implements PipeTransform {
.local()
.format('D/M/YY hh:mm A');
} else {
- value = value?.replace('Z', '');
+ value = value?.replace?.('Z', '');
date = moment.parseZone(value).utc().utcOffset(offset).local().format('D/M/YY hh:mm A');
}
return date;
diff --git a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/task-message.service.ts b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/task-message.service.ts
index db91422e8a3..cf7662eac65 100644
--- a/src/pybind/mgr/dashboard/frontend/src/app/shared/services/task-message.service.ts
+++ b/src/pybind/mgr/dashboard/frontend/src/app/shared/services/task-message.service.ts
@@ -540,7 +540,7 @@ export class TaskMessageService {
}
service(metadata: any) {
- return $localize`Service '${metadata.service_name}'`;
+ return $localize`service '${metadata.service_name}'`;
}
crudMessage(metadata: any) {
diff --git a/src/pybind/mgr/dashboard/openapi.yaml b/src/pybind/mgr/dashboard/openapi.yaml
index b5a34304ada..1fb9771795f 100644
--- a/src/pybind/mgr/dashboard/openapi.yaml
+++ b/src/pybind/mgr/dashboard/openapi.yaml
@@ -8182,7 +8182,12 @@ paths:
- NFS-Ganesha
/api/nvmeof/gateway:
get:
- parameters: []
+ parameters:
+ - allowEmptyValue: true
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
@@ -8203,7 +8208,7 @@ paths:
summary: Get information about the NVMeoF gateway
tags:
- NVMe-oF Gateway
- /api/nvmeof/subsystem:
+ /api/nvmeof/gateway/group:
get:
parameters: []
responses:
@@ -8223,6 +8228,33 @@ paths:
trace.
security:
- jwt: []
+ tags:
+ - NVMe-oF Gateway
+ /api/nvmeof/subsystem:
+ get:
+ parameters:
+ - allowEmptyValue: true
+ in: query
+ name: gw_group
+ schema:
+ type: string
+ responses:
+ '200':
+ content:
+ application/vnd.ceph.api.v1.0+json:
+ type: object
+ description: OK
+ '400':
+ description: Operation exception. Please check the response body for details.
+ '401':
+ description: Unauthenticated access. Please login first.
+ '403':
+ description: Unauthorized access. Please check your permissions.
+ '500':
+ description: Unexpected error. Please check the response body for the stack
+ trace.
+ security:
+ - jwt: []
summary: List all NVMeoF subsystems
tags:
- NVMe-oF Subsystem
@@ -8236,8 +8268,10 @@ paths:
enable_ha:
description: Enable high availability
type: boolean
+ gw_group:
+ type: string
max_namespaces:
- default: 256
+ default: 1024
description: Maximum number of namespaces
type: integer
nqn:
@@ -8287,6 +8321,11 @@ paths:
name: force
schema:
type: boolean
+ - allowEmptyValue: true
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'202':
content:
@@ -8320,6 +8359,11 @@ paths:
required: true
schema:
type: string
+ - allowEmptyValue: true
+ in: query
+ name: gw_group
+ schema:
+ type: string
responses:
'200':
content:
diff --git a/src/pybind/mgr/dashboard/services/nvmeof_cli.py b/src/pybind/mgr/dashboard/services/nvmeof_cli.py
index 75a121a48d6..11a95237053 100644
--- a/src/pybind/mgr/dashboard/services/nvmeof_cli.py
+++ b/src/pybind/mgr/dashboard/services/nvmeof_cli.py
@@ -19,13 +19,13 @@ def list_nvmeof_gateways(_):
@CLIWriteCommand('dashboard nvmeof-gateway-add')
@CLICheckNonemptyFileInput(desc='NVMe-oF gateway configuration')
-def add_nvmeof_gateway(_, inbuf, name: str):
+def add_nvmeof_gateway(_, inbuf, name: str, group: str, daemon_name: str):
'''
Add NVMe-oF gateway configuration. Gateway URL read from -i <file>
'''
service_url = inbuf
try:
- NvmeofGatewaysConfig.add_gateway(name, service_url)
+ NvmeofGatewaysConfig.add_gateway(name, service_url, group, daemon_name)
return 0, 'Success', ''
except NvmeofGatewayAlreadyExists as ex:
return -errno.EEXIST, '', str(ex)
diff --git a/src/pybind/mgr/dashboard/services/nvmeof_client.py b/src/pybind/mgr/dashboard/services/nvmeof_client.py
index 019ecf0267c..d6b126500b0 100644
--- a/src/pybind/mgr/dashboard/services/nvmeof_client.py
+++ b/src/pybind/mgr/dashboard/services/nvmeof_client.py
@@ -22,13 +22,24 @@ else:
class NVMeoFClient(object):
pb2 = pb2
- def __init__(self):
+ def __init__(self, gw_group: Optional[str] = None):
logger.info("Initiating nvmeof gateway connection...")
- service_name, self.gateway_addr = NvmeofGatewaysConfig.get_service_info()
+ try:
+ if not gw_group:
+ service_name, self.gateway_addr = NvmeofGatewaysConfig.get_service_info()
+ else:
+ service_name, self.gateway_addr = NvmeofGatewaysConfig.get_service_info(
+ gw_group
+ )
+ except TypeError as e:
+ raise DashboardException(
+ f'Unable to retrieve the gateway info: {e}'
+ )
root_ca_cert = NvmeofGatewaysConfig.get_root_ca_cert(service_name)
- client_key = NvmeofGatewaysConfig.get_client_key(service_name)
- client_cert = NvmeofGatewaysConfig.get_client_cert(service_name)
+ if root_ca_cert:
+ client_key = NvmeofGatewaysConfig.get_client_key(service_name)
+ client_cert = NvmeofGatewaysConfig.get_client_cert(service_name)
if root_ca_cert and client_key and client_cert:
logger.info('Securely connecting to: %s', self.gateway_addr)
diff --git a/src/pybind/mgr/dashboard/services/nvmeof_conf.py b/src/pybind/mgr/dashboard/services/nvmeof_conf.py
index 3f076d7b216..a5a9979af25 100644
--- a/src/pybind/mgr/dashboard/services/nvmeof_conf.py
+++ b/src/pybind/mgr/dashboard/services/nvmeof_conf.py
@@ -51,11 +51,29 @@ class NvmeofGatewaysConfig(object):
return cls._load_config_from_store()
@classmethod
- def add_gateway(cls, name, service_url):
+ def add_gateway(cls, name, service_url, group, daemon_name):
config = cls.get_gateways_config()
- if name in config:
- raise NvmeofGatewayAlreadyExists(name)
- config['gateways'][name] = {'service_url': service_url}
+
+ if name in config.get('gateways', {}):
+ existing_gateways = config['gateways'][name]
+ for gateway in existing_gateways:
+ if 'daemon_name' not in gateway:
+ gateway['daemon_name'] = daemon_name
+ break
+ if gateway['service_url'] == service_url:
+ return
+
+ new_gateway = {
+ 'service_url': service_url,
+ 'group': group,
+ 'daemon_name': daemon_name
+ }
+
+ if name in config.get('gateways', {}):
+ config['gateways'][name].append(new_gateway)
+ else:
+ config['gateways'][name] = [new_gateway]
+
cls._save_config(config)
@classmethod
@@ -67,12 +85,18 @@ class NvmeofGatewaysConfig(object):
cls._save_config(config)
@classmethod
- def get_service_info(cls):
+ def get_service_info(cls, group=None):
try:
config = cls.get_gateways_config()
- service_name = list(config['gateways'].keys())[0]
- addr = config['gateways'][service_name]['service_url']
- return service_name, addr
+ gateways = config.get('gateways', {})
+ if not gateways:
+ return None
+
+ if group:
+ return cls._get_name_url_for_group(gateways, group)
+
+ return cls._get_default_service(gateways)
+
except (KeyError, IndexError) as e:
raise DashboardException(
msg=f'NVMe-oF configuration is not set: {e}',
@@ -112,3 +136,45 @@ class NvmeofGatewaysConfig(object):
# just return None if any orchestrator error is raised
# otherwise nvmeof api will raise this error and doesn't proceed.
return None
+
+ @classmethod
+ def _get_name_url_for_group(cls, gateways, group):
+ try:
+ orch = OrchClient.instance()
+ for service_name, svc_config in gateways.items():
+ # get the group name of the service and match it against the
+ # group name provided
+ group_name_from_svc = orch.services.get(service_name)[0].spec.group
+ if group == group_name_from_svc:
+ running_daemons = cls._get_running_daemons(orch, service_name)
+ config = cls._get_running_daemon_svc_config(svc_config, running_daemons)
+
+ if config:
+ return service_name, config['service_url']
+ return None
+
+ except OrchestratorError:
+ return cls._get_default_service(gateways)
+
+ @classmethod
+ def _get_running_daemons(cls, orch, service_name):
+ # get the running nvmeof daemons
+ daemons = [d.to_dict()
+ for d in orch.services.list_daemons(service_name=service_name)]
+ return [d['daemon_name'] for d in daemons
+ if d['status_desc'] == 'running']
+
+ @classmethod
+ def _get_running_daemon_svc_config(cls, svc_config, running_daemons):
+ try:
+ return next(config for config in svc_config
+ if config['daemon_name'] in running_daemons)
+ except StopIteration:
+ return None
+
+ @classmethod
+ def _get_default_service(cls, gateways):
+ if gateways:
+ service_name = list(gateways.keys())[0]
+ return service_name, gateways[service_name][0]['service_url']
+ return None
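With add_gateway() now appending entries, each service name maps to a list of gateway records rather than a single dict. A hedged sketch of the resulting layout and of the default selection mirrored from _get_default_service(); all concrete values are hypothetical:

    config = {
        'gateways': {
            'nvmeof.rbd.group1': [
                {'service_url': '10.0.0.15:5500',
                 'group': 'group1',
                 'daemon_name': 'nvmeof.rbd.group1.node0.abcdef'},
            ],
        },
    }

    def default_service(gateways):
        # first service name, first entry's URL, as in _get_default_service()
        if gateways:
            name = next(iter(gateways))
            return name, gateways[name][0]['service_url']
        return None

    print(default_service(config['gateways']))
    # -> ('nvmeof.rbd.group1', '10.0.0.15:5500')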
diff --git a/src/pybind/mgr/smb/enums.py b/src/pybind/mgr/smb/enums.py
index f362219221b..dea45f951f8 100644
--- a/src/pybind/mgr/smb/enums.py
+++ b/src/pybind/mgr/smb/enums.py
@@ -16,6 +16,23 @@ else: # pragma: no cover
class CephFSStorageProvider(_StrEnum):
KERNEL_MOUNT = 'kcephfs'
SAMBA_VFS = 'samba-vfs'
+ SAMBA_VFS_CLASSIC = 'samba-vfs/classic'
+ SAMBA_VFS_NEW = 'samba-vfs/new'
+
+ def expand(self) -> 'CephFSStorageProvider':
+ """Expand abbreviated/default values into the full/expanded form."""
+ if self == self.SAMBA_VFS:
+ # mypy gets confused by enums
+ return self.__class__(self.SAMBA_VFS_NEW)
+ return self
+
+ def is_vfs(self) -> bool:
+ """Return true if value is a samba vfs provider."""
+ return self in {
+ self.SAMBA_VFS,
+ self.SAMBA_VFS_CLASSIC,
+ self.SAMBA_VFS_NEW,
+ }
class SubSystem(_StrEnum):
diff --git a/src/pybind/mgr/smb/handler.py b/src/pybind/mgr/smb/handler.py
index b2285eef575..fc93a7277cb 100644
--- a/src/pybind/mgr/smb/handler.py
+++ b/src/pybind/mgr/smb/handler.py
@@ -620,11 +620,6 @@ class ClusterConfigHandler:
change_group.cluster.cluster_id,
set(change_group.cache),
)
- external.rm_other_in_ns(
- self.public_store,
- change_group.cluster.cluster_id,
- set(change_group.cache),
- )
# ensure an entity exists with access to the volumes
for volume in vols:
@@ -1025,7 +1020,7 @@ def _generate_share(
share: resources.Share, resolver: PathResolver, cephx_entity: str
) -> Dict[str, Dict[str, str]]:
assert share.cephfs is not None
- assert share.cephfs.provider == CephFSStorageProvider.SAMBA_VFS
+ assert share.cephfs.provider.is_vfs(), "not a vfs provider"
assert cephx_entity, "cephx entity name missing"
# very annoyingly, samba's ceph module absolutely must NOT have the
# "client." bit in front. JJM has been tripped up by this multiple times -
@@ -1040,15 +1035,24 @@ def _generate_share(
share.cephfs.subvolume,
share.cephfs.path,
)
+ try:
+ ceph_vfs = {
+ CephFSStorageProvider.SAMBA_VFS_CLASSIC: 'ceph',
+ CephFSStorageProvider.SAMBA_VFS_NEW: 'ceph_new',
+ }[share.checked_cephfs.provider.expand()]
+ except KeyError:
+ raise ValueError(
+ f'unsupported provider: {share.checked_cephfs.provider}'
+ )
cfg = {
# smb.conf options
'options': {
'path': path,
- "vfs objects": "acl_xattr ceph",
+ "vfs objects": f"acl_xattr {ceph_vfs}",
'acl_xattr:security_acl_name': 'user.NTACL',
- 'ceph:config_file': '/etc/ceph/ceph.conf',
- 'ceph:filesystem': share.cephfs.volume,
- 'ceph:user_id': cephx_entity,
+ f'{ceph_vfs}:config_file': '/etc/ceph/ceph.conf',
+ f'{ceph_vfs}:filesystem': share.cephfs.volume,
+ f'{ceph_vfs}:user_id': cephx_entity,
'read only': ynbool(share.readonly),
'browseable': ynbool(share.browseable),
'kernel share modes': 'no',
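The provider now selects which Samba VFS module the generated options target. A small sketch of the mapping and the prefixed option keys it produces, using the same string values as the enum and handler above; this is an illustration, not the smb module's code:

    VFS_MODULE = {
        'samba-vfs/classic': 'ceph',      # legacy vfs module
        'samba-vfs/new': 'ceph_new',      # newer vfs module
    }

    def share_vfs_options(provider, volume, cephx_entity):
        mod = VFS_MODULE[provider]
        return {
            'vfs objects': f'acl_xattr {mod}',
            f'{mod}:config_file': '/etc/ceph/ceph.conf',
            f'{mod}:filesystem': volume,
            f'{mod}:user_id': cephx_entity,
        }

    print(share_vfs_options('samba-vfs/classic', 'cephfs', 'smb.fs.cluster.foo'))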
diff --git a/src/pybind/mgr/smb/tests/test_smb.py b/src/pybind/mgr/smb/tests/test_smb.py
index 547e6c712bc..86a2310a4de 100644
--- a/src/pybind/mgr/smb/tests/test_smb.py
+++ b/src/pybind/mgr/smb/tests/test_smb.py
@@ -431,11 +431,11 @@ def test_share_dump_config(tmodule):
'browseable': 'Yes',
'kernel share modes': 'no',
'x:ceph:id': 'foo.s1',
- 'vfs objects': 'acl_xattr ceph',
+ 'vfs objects': 'acl_xattr ceph_new',
'acl_xattr:security_acl_name': 'user.NTACL',
- 'ceph:config_file': '/etc/ceph/ceph.conf',
- 'ceph:filesystem': 'cephfs',
- 'ceph:user_id': 'smb.fs.cluster.foo',
+ 'ceph_new:config_file': '/etc/ceph/ceph.conf',
+ 'ceph_new:filesystem': 'cephfs',
+ 'ceph_new:user_id': 'smb.fs.cluster.foo',
},
},
'Ess Two': {
@@ -445,11 +445,11 @@ def test_share_dump_config(tmodule):
'browseable': 'Yes',
'kernel share modes': 'no',
'x:ceph:id': 'foo.stwo',
- 'vfs objects': 'acl_xattr ceph',
+ 'vfs objects': 'acl_xattr ceph_new',
'acl_xattr:security_acl_name': 'user.NTACL',
- 'ceph:config_file': '/etc/ceph/ceph.conf',
- 'ceph:filesystem': 'cephfs',
- 'ceph:user_id': 'smb.fs.cluster.foo',
+ 'ceph_new:config_file': '/etc/ceph/ceph.conf',
+ 'ceph_new:filesystem': 'cephfs',
+ 'ceph_new:user_id': 'smb.fs.cluster.foo',
},
},
},
diff --git a/src/pybind/mgr/snap_schedule/fs/schedule_client.py b/src/pybind/mgr/snap_schedule/fs/schedule_client.py
index 1c46ccc6933..b58f20f1275 100644
--- a/src/pybind/mgr/snap_schedule/fs/schedule_client.py
+++ b/src/pybind/mgr/snap_schedule/fs/schedule_client.py
@@ -361,7 +361,7 @@ class SnapSchedClient(CephfsClient):
path = sched.path
prune_candidates = set()
time = datetime.now(timezone.utc)
- mds_max_snaps_per_dir = self.mgr.get_ceph_option('mds_max_snaps_per_dir')
+ mds_max_snaps_per_dir = self.mgr.get_foreign_ceph_option('mds', 'mds_max_snaps_per_dir')
with open_filesystem(self, sched.fs) as fs_handle:
snap_dir = self.mgr.rados.conf_get('client_snapdir')
with fs_handle.opendir(f'{path}/{snap_dir}') as d_handle:
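The switch to get_foreign_ceph_option() matters because mds_max_snaps_per_dir is an MDS option, not one the mgr daemon holds locally. A hedged sketch of how the value would typically be consumed, with the fallback value an assumption:

    def effective_snap_limit(mgr):
        # ask the monitors for the MDS-scoped option instead of the mgr-local map
        value = mgr.get_foreign_ceph_option('mds', 'mds_max_snaps_per_dir')
        return int(value) if value is not None else 100  # assumed fallback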
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py
index 2b1afc141d8..2238cd01c37 100644
--- a/src/python-common/ceph/deployment/service_spec.py
+++ b/src/python-common/ceph/deployment/service_spec.py
@@ -1315,6 +1315,7 @@ class NvmeofServiceSpec(ServiceSpec):
service_id: Optional[str] = None,
name: Optional[str] = None,
group: Optional[str] = None,
+ addr: Optional[str] = None,
port: Optional[int] = None,
pool: Optional[str] = None,
enable_auth: bool = False,
@@ -1347,6 +1348,7 @@ class NvmeofServiceSpec(ServiceSpec):
transport_tcp_options: Optional[Dict[str, int]] =
{"in_capsule_data_size": 8192, "max_io_qpairs_per_ctrlr": 7},
tgt_cmd_extra_args: Optional[str] = None,
+ discovery_addr: Optional[str] = None,
discovery_port: Optional[int] = None,
log_level: Optional[str] = 'INFO',
log_files_enabled: Optional[bool] = True,
@@ -1378,6 +1380,8 @@ class NvmeofServiceSpec(ServiceSpec):
#: RADOS pool where ceph-nvmeof config data is stored.
self.pool = pool
+ #: ``addr`` address of the nvmeof gateway
+ self.addr = addr or "0.0.0.0"
#: ``port`` port of the nvmeof gateway
self.port = port or 5500
#: ``name`` name of the nvmeof gateway
@@ -1442,6 +1446,8 @@ class NvmeofServiceSpec(ServiceSpec):
self.transport_tcp_options: Optional[Dict[str, int]] = transport_tcp_options
#: ``tgt_cmd_extra_args`` extra arguments for the nvmf_tgt process
self.tgt_cmd_extra_args = tgt_cmd_extra_args
+ #: ``discovery_addr`` address of the discovery service
+ self.discovery_addr = discovery_addr or "0.0.0.0"
#: ``discovery_port`` port of the discovery service
self.discovery_port = discovery_port or 8009
#: ``log_level`` the nvmeof gateway log level
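The new addr and discovery_addr fields default to 0.0.0.0, alongside the existing port defaults. A hedged construction sketch, assuming keyword arguments as in the constructor above; the service id, pool and addresses are placeholders:

    from ceph.deployment.service_spec import NvmeofServiceSpec

    spec = NvmeofServiceSpec(service_id='rbd.group1', pool='rbd', group='group1',
                             addr='10.0.0.15', discovery_addr='10.0.0.15')
    print(spec.addr, spec.port, spec.discovery_addr, spec.discovery_port)
    # expected: 10.0.0.15 5500 10.0.0.15 8009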
diff --git a/src/rgw/driver/daos/rgw_sal_daos.cc b/src/rgw/driver/daos/rgw_sal_daos.cc
index 0558dbf461e..cf6820a9111 100644
--- a/src/rgw/driver/daos/rgw_sal_daos.cc
+++ b/src/rgw/driver/daos/rgw_sal_daos.cc
@@ -1196,7 +1196,8 @@ int DaosObject::DaosDeleteOp::delete_obj(const DoutPrefixProvider* dpp,
}
int DaosObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y,
- uint32_t flags) {
+ uint32_t flags, std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv) {
ldpp_dout(dpp, 20) << "DEBUG: delete_object" << dendl;
DaosObject::DaosDeleteOp del_op(this);
del_op.params.bucket_owner = bucket->get_info().owner;
@@ -1677,7 +1678,8 @@ int DaosMultipartUpload::complete(
map<int, string>& part_etags, list<rgw_obj_index_key>& remove_objs,
uint64_t& accounted_size, bool& compressed, RGWCompressionInfo& cs_info,
off_t& off, std::string& tag, ACLOwner& owner, uint64_t olh_epoch,
- rgw::sal::Object* target_obj) {
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes) {
ldpp_dout(dpp, 20) << "DEBUG: complete" << dendl;
char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
@@ -1922,6 +1924,15 @@ int DaosMultipartUpload::complete(
return ret;
}
+int DaosMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes)
+{
+ return -ENOTSUP;
+}
+
int DaosMultipartUpload::get_info(const DoutPrefixProvider* dpp,
optional_yield y, rgw_placement_rule** rule,
rgw::sal::Attrs* attrs) {
diff --git a/src/rgw/driver/daos/rgw_sal_daos.h b/src/rgw/driver/daos/rgw_sal_daos.h
index 8cff96088ee..7cc20260227 100644
--- a/src/rgw/driver/daos/rgw_sal_daos.h
+++ b/src/rgw/driver/daos/rgw_sal_daos.h
@@ -596,7 +596,8 @@ class DaosObject : public StoreObject {
virtual ~DaosObject();
virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y,
- uint32_t flags) override;
+ uint32_t flags, std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv) override;
virtual int copy_object(
const ACLOwner& owner, const rgw_user& remote_user,
req_info* info, const rgw_zone_id& source_zone,
@@ -861,7 +862,13 @@ class DaosMultipartUpload : public StoreMultipartUpload {
uint64_t& accounted_size, bool& compressed,
RGWCompressionInfo& cs_info, off_t& off,
std::string& tag, ACLOwner& owner, uint64_t olh_epoch,
- rgw::sal::Object* target_obj) override;
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes) override;
+ virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes) override;
virtual int get_info(const DoutPrefixProvider* dpp, optional_yield y,
rgw_placement_rule** rule,
rgw::sal::Attrs* attrs = nullptr) override;
diff --git a/src/rgw/driver/dbstore/common/dbstore.cc b/src/rgw/driver/dbstore/common/dbstore.cc
index d548bc4d8c0..b3aed3948c8 100644
--- a/src/rgw/driver/dbstore/common/dbstore.cc
+++ b/src/rgw/driver/dbstore/common/dbstore.cc
@@ -1973,7 +1973,7 @@ int DB::Object::Delete::create_dm(const DoutPrefixProvider *dpp,
}
int DB::get_entry(const std::string& oid, const std::string& marker,
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry)
+ rgw::sal::LCEntry& entry)
{
int ret = 0;
const DoutPrefixProvider *dpp = get_def_dpp();
@@ -1982,7 +1982,7 @@ int DB::get_entry(const std::string& oid, const std::string& marker,
InitializeParams(dpp, &params);
params.op.lc_entry.index = oid;
- params.op.lc_entry.entry.set_bucket(marker);
+ params.op.lc_entry.entry.bucket = marker;
params.op.query_str = "get_entry";
ret = ProcessOp(dpp, "GetLCEntry", &params);
@@ -1992,14 +1992,8 @@ int DB::get_entry(const std::string& oid, const std::string& marker,
goto out;
}
- if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found
- rgw::sal::Lifecycle::LCEntry* e;
- e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry);
- if (!e) {
- ret = -ENOMEM;
- goto out;
- }
- entry->reset(e);
+ if (params.op.lc_entry.entry.start_time != 0) { //ensure entry found
+ entry = std::move(params.op.lc_entry.entry);
}
out:
@@ -2007,7 +2001,7 @@ out:
}
int DB::get_next_entry(const std::string& oid, const std::string& marker,
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry)
+ rgw::sal::LCEntry& entry)
{
int ret = 0;
const DoutPrefixProvider *dpp = get_def_dpp();
@@ -2016,7 +2010,7 @@ int DB::get_next_entry(const std::string& oid, const std::string& marker,
InitializeParams(dpp, &params);
params.op.lc_entry.index = oid;
- params.op.lc_entry.entry.set_bucket(marker);
+ params.op.lc_entry.entry.bucket = marker;
params.op.query_str = "get_next_entry";
ret = ProcessOp(dpp, "GetLCEntry", &params);
@@ -2026,21 +2020,15 @@ int DB::get_next_entry(const std::string& oid, const std::string& marker,
goto out;
}
- if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found
- rgw::sal::Lifecycle::LCEntry* e;
- e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry);
- if (!e) {
- ret = -ENOMEM;
- goto out;
- }
- entry->reset(e);
+ if (params.op.lc_entry.entry.start_time != 0) { //ensure entry found
+ entry = std::move(params.op.lc_entry.entry);
}
out:
return ret;
}
-int DB::set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry)
+int DB::set_entry(const std::string& oid, const rgw::sal::LCEntry& entry)
{
int ret = 0;
const DoutPrefixProvider *dpp = get_def_dpp();
@@ -2063,7 +2051,7 @@ out:
}
int DB::list_entries(const std::string& oid, const std::string& marker,
- uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries)
+ uint32_t max_entries, std::vector<rgw::sal::LCEntry>& entries)
{
int ret = 0;
const DoutPrefixProvider *dpp = get_def_dpp();
@@ -2085,14 +2073,14 @@ int DB::list_entries(const std::string& oid, const std::string& marker,
}
for (auto& entry : params.op.lc_entry.list_entries) {
- entries.push_back(std::make_unique<rgw::sal::StoreLifecycle::StoreLCEntry>(std::move(entry)));
+ entries.push_back(std::move(entry));
}
out:
return ret;
}
-int DB::rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry)
+int DB::rm_entry(const std::string& oid, const rgw::sal::LCEntry& entry)
{
int ret = 0;
const DoutPrefixProvider *dpp = get_def_dpp();
@@ -2114,7 +2102,7 @@ out:
return ret;
}
-int DB::get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head)
+int DB::get_head(const std::string& oid, rgw::sal::LCHead& head)
{
int ret = 0;
const DoutPrefixProvider *dpp = get_def_dpp();
@@ -2131,13 +2119,13 @@ int DB::get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LC
goto out;
}
- *head = std::make_unique<rgw::sal::StoreLifecycle::StoreLCHead>(params.op.lc_head.head);
+ head = std::move(params.op.lc_head.head);
out:
return ret;
}
-int DB::put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head)
+int DB::put_head(const std::string& oid, const rgw::sal::LCHead& head)
{
int ret = 0;
const DoutPrefixProvider *dpp = get_def_dpp();
diff --git a/src/rgw/driver/dbstore/common/dbstore.h b/src/rgw/driver/dbstore/common/dbstore.h
index 3f8191f5a92..605477ef4ad 100644
--- a/src/rgw/driver/dbstore/common/dbstore.h
+++ b/src/rgw/driver/dbstore/common/dbstore.h
@@ -104,15 +104,15 @@ struct DBOpObjectDataInfo {
struct DBOpLCHeadInfo {
std::string index;
- rgw::sal::StoreLifecycle::StoreLCHead head;
+ rgw::sal::LCHead head;
};
struct DBOpLCEntryInfo {
std::string index;
- rgw::sal::StoreLifecycle::StoreLCEntry entry;
+ rgw::sal::LCEntry entry;
// used for list query
std::string min_marker;
- std::list<rgw::sal::StoreLifecycle::StoreLCEntry> list_entries;
+ std::list<rgw::sal::LCEntry> list_entries;
};
struct DBOpInfo {
@@ -1979,15 +1979,15 @@ class DB {
RGWObjState *astate, void *arg);
int get_entry(const std::string& oid, const std::string& marker,
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry);
+ rgw::sal::LCEntry& entry);
int get_next_entry(const std::string& oid, const std::string& marker,
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry);
- int set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry);
+ rgw::sal::LCEntry& entry);
+ int set_entry(const std::string& oid, const rgw::sal::LCEntry& entry);
int list_entries(const std::string& oid, const std::string& marker,
- uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries);
- int rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry);
- int get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head);
- int put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head);
+ uint32_t max_entries, std::vector<rgw::sal::LCEntry>& entries);
+ int rm_entry(const std::string& oid, const rgw::sal::LCEntry& entry);
+ int get_head(const std::string& oid, rgw::sal::LCHead& head);
+ int put_head(const std::string& oid, const rgw::sal::LCHead& head);
int delete_stale_objs(const DoutPrefixProvider *dpp, const std::string& bucket,
uint32_t min_wait);
int createGC(const DoutPrefixProvider *_dpp);
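The hunks above drop the heap-allocated StoreLCEntry/StoreLCHead out-parameters in favour of plain rgw::sal::LCEntry and rgw::sal::LCHead values filled in by reference. A minimal caller-side sketch of the new pattern, assuming the DB class sits in the rgw::store namespace; db, lc_index and bucket_marker are illustrative placeholders, not part of this patch:

// Sketch only: error handling follows the convention used elsewhere in dbstore.
#include <cerrno>
#include <string>
#include <vector>
#include "rgw/driver/dbstore/common/dbstore.h"

int read_lc_state(rgw::store::DB* db, const std::string& lc_index,
                  const std::string& bucket_marker)
{
  rgw::sal::LCHead head;                    // value type, no allocation
  if (int r = db->get_head(lc_index, head); r < 0) {
    return r;
  }

  rgw::sal::LCEntry entry;                  // filled in by reference
  if (int r = db->get_entry(lc_index, bucket_marker, entry); r < 0) {
    return r;
  }
  if (entry.bucket.empty()) {
    return -ENOENT;                         // no entry found for this marker
  }

  std::vector<rgw::sal::LCEntry> entries;   // plain values moved out of the DB layer
  return db->list_entries(lc_index, bucket_marker, 100, entries);
}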
diff --git a/src/rgw/driver/dbstore/sqlite/sqliteDB.cc b/src/rgw/driver/dbstore/sqlite/sqliteDB.cc
index 554d8fe94cf..f503d67b795 100644
--- a/src/rgw/driver/dbstore/sqlite/sqliteDB.cc
+++ b/src/rgw/driver/dbstore/sqlite/sqliteDB.cc
@@ -561,9 +561,9 @@ static int list_lc_entry(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_st
return -1;
op.lc_entry.index = (const char*)sqlite3_column_text(stmt, LCEntryIndex);
- op.lc_entry.entry.set_bucket((const char*)sqlite3_column_text(stmt, LCEntryBucketName));
- op.lc_entry.entry.set_start_time(sqlite3_column_int(stmt, LCEntryStartTime));
- op.lc_entry.entry.set_status(sqlite3_column_int(stmt, LCEntryStatus));
+ op.lc_entry.entry.bucket = (const char*)sqlite3_column_text(stmt, LCEntryBucketName);
+ op.lc_entry.entry.start_time = sqlite3_column_int(stmt, LCEntryStartTime);
+ op.lc_entry.entry.status = sqlite3_column_int(stmt, LCEntryStatus);
op.lc_entry.list_entries.push_back(op.lc_entry.entry);
@@ -577,10 +577,10 @@ static int list_lc_head(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stm
int64_t start_date;
op.lc_head.index = (const char*)sqlite3_column_text(stmt, LCHeadIndex);
- op.lc_head.head.set_marker((const char*)sqlite3_column_text(stmt, LCHeadMarker));
+ op.lc_head.head.marker = (const char*)sqlite3_column_text(stmt, LCHeadMarker);
SQL_DECODE_BLOB_PARAM(dpp, stmt, LCHeadStartDate, start_date, sdb);
- op.lc_head.head.get_start_date() = start_date;
+ op.lc_head.head.start_date = start_date;
return 0;
}
@@ -2692,13 +2692,13 @@ int SQLInsertLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *par
SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb);
- SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.bucket.c_str(), sdb);
SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.status, sdb);
- SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_status(), sdb);
+ SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.status, sdb);
SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.start_time, sdb);
- SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_start_time(), sdb);
+ SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.start_time, sdb);
out:
return rc;
@@ -2741,7 +2741,7 @@ int SQLRemoveLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *par
SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb);
- SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.bucket.c_str(), sdb);
out:
return rc;
@@ -2796,7 +2796,7 @@ int SQLGetLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params
SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.index.c_str(), sdb);
SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.bucket_name, sdb);
- SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
+ SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.entry.bucket.c_str(), sdb);
out:
return rc;
@@ -2892,7 +2892,7 @@ int SQLInsertLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *para
SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.marker, sdb);
- SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.head.get_marker().c_str(), sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.head.marker.c_str(), sdb);
SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.start_date, sdb);
SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, static_cast<int64_t>(params->op.lc_head.head.start_date), sdb);
diff --git a/src/rgw/driver/dbstore/tests/dbstore_tests.cc b/src/rgw/driver/dbstore/tests/dbstore_tests.cc
index c89addeade1..2ceed7218d8 100644
--- a/src/rgw/driver/dbstore/tests/dbstore_tests.cc
+++ b/src/rgw/driver/dbstore/tests/dbstore_tests.cc
@@ -1255,31 +1255,30 @@ TEST_F(DBStoreTest, LCHead) {
std::string index1 = "bucket1";
std::string index2 = "bucket2";
time_t lc_time = ceph_clock_now();
- std::unique_ptr<rgw::sal::Lifecycle::LCHead> head;
- std::string ents[] = {"entry1", "entry2", "entry3"};
- rgw::sal::StoreLifecycle::StoreLCHead head1(lc_time, 0, ents[0]);
- rgw::sal::StoreLifecycle::StoreLCHead head2(lc_time, 0, ents[1]);
- rgw::sal::StoreLifecycle::StoreLCHead head3(lc_time, 0, ents[2]);
+ rgw::sal::LCHead head;
+ rgw::sal::LCHead head1{lc_time, "entry1"};
+ rgw::sal::LCHead head2{lc_time, "entry2"};
+ rgw::sal::LCHead head3{lc_time, "entry3"};
ret = db->put_head(index1, head1);
ASSERT_EQ(ret, 0);
ret = db->put_head(index2, head2);
ASSERT_EQ(ret, 0);
- ret = db->get_head(index1, &head);
+ ret = db->get_head(index1, head);
ASSERT_EQ(ret, 0);
- ASSERT_EQ(head->get_marker(), "entry1");
+ ASSERT_EQ(head.marker, "entry1");
- ret = db->get_head(index2, &head);
+ ret = db->get_head(index2, head);
ASSERT_EQ(ret, 0);
- ASSERT_EQ(head->get_marker(), "entry2");
+ ASSERT_EQ(head.marker, "entry2");
// update index1
ret = db->put_head(index1, head3);
ASSERT_EQ(ret, 0);
- ret = db->get_head(index1, &head);
+ ret = db->get_head(index1, head);
ASSERT_EQ(ret, 0);
- ASSERT_EQ(head->get_marker(), "entry3");
+ ASSERT_EQ(head.marker, "entry3");
}
TEST_F(DBStoreTest, LCEntry) {
@@ -1290,13 +1289,13 @@ TEST_F(DBStoreTest, LCEntry) {
std::string index2 = "lcindex2";
typedef enum {lc_uninitial = 1, lc_complete} status;
std::string ents[] = {"bucket1", "bucket2", "bucket3", "bucket4"};
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry;
- rgw::sal::StoreLifecycle::StoreLCEntry entry1(ents[0], lc_time, lc_uninitial);
- rgw::sal::StoreLifecycle::StoreLCEntry entry2(ents[1], lc_time, lc_uninitial);
- rgw::sal::StoreLifecycle::StoreLCEntry entry3(ents[2], lc_time, lc_uninitial);
- rgw::sal::StoreLifecycle::StoreLCEntry entry4(ents[3], lc_time, lc_uninitial);
+ rgw::sal::LCEntry entry;
+ rgw::sal::LCEntry entry1{ents[0], lc_time, lc_uninitial};
+ rgw::sal::LCEntry entry2{ents[1], lc_time, lc_uninitial};
+ rgw::sal::LCEntry entry3{ents[2], lc_time, lc_uninitial};
+ rgw::sal::LCEntry entry4{ents[3], lc_time, lc_uninitial};
- vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> lc_entries;
+ vector<rgw::sal::LCEntry> lc_entries;
ret = db->set_entry(index1, entry1);
ASSERT_EQ(ret, 0);
@@ -1308,44 +1307,44 @@ TEST_F(DBStoreTest, LCEntry) {
ASSERT_EQ(ret, 0);
// get entry index1, entry1
- ret = db->get_entry(index1, ents[0], &entry);
+ ret = db->get_entry(index1, ents[0], entry);
ASSERT_EQ(ret, 0);
- ASSERT_EQ(entry->get_status(), lc_uninitial);
- ASSERT_EQ(entry->get_start_time(), lc_time);
+ ASSERT_EQ(entry.status, lc_uninitial);
+ ASSERT_EQ(entry.start_time, lc_time);
// get next entry index1, entry2
- ret = db->get_next_entry(index1, ents[1], &entry);
+ ret = db->get_next_entry(index1, ents[1], entry);
ASSERT_EQ(ret, 0);
- ASSERT_EQ(entry->get_bucket(), ents[2]);
- ASSERT_EQ(entry->get_status(), lc_uninitial);
- ASSERT_EQ(entry->get_start_time(), lc_time);
+ ASSERT_EQ(entry.bucket, ents[2]);
+ ASSERT_EQ(entry.status, lc_uninitial);
+ ASSERT_EQ(entry.start_time, lc_time);
// update entry4 to entry5
entry4.status = lc_complete;
ret = db->set_entry(index2, entry4);
ASSERT_EQ(ret, 0);
- ret = db->get_entry(index2, ents[3], &entry);
+ ret = db->get_entry(index2, ents[3], entry);
ASSERT_EQ(ret, 0);
- ASSERT_EQ(entry->get_status(), lc_complete);
+ ASSERT_EQ(entry.status, lc_complete);
// list entries
ret = db->list_entries(index1, "", 5, lc_entries);
ASSERT_EQ(ret, 0);
for (const auto& ent: lc_entries) {
cout << "###################### \n";
- cout << "lc entry.bucket : " << ent->get_bucket() << "\n";
- cout << "lc entry.status : " << ent->get_status() << "\n";
+ cout << "lc entry.bucket : " << ent.bucket << "\n";
+ cout << "lc entry.status : " << ent.status << "\n";
}
// remove index1, entry3
ret = db->rm_entry(index1, entry3);
ASSERT_EQ(ret, 0);
- // get next entry index1, entry2.. should be null
- entry.release();
- ret = db->get_next_entry(index1, ents[1], &entry);
+ // get next entry index1, entry2.. should be empty
+ entry = rgw::sal::LCEntry{};
+ ret = db->get_next_entry(index1, ents[1], entry);
ASSERT_EQ(ret, 0);
- ASSERT_EQ(entry.get(), nullptr);
+ ASSERT_TRUE(entry.bucket.empty());
}
TEST_F(DBStoreTest, RemoveBucket) {
diff --git a/src/rgw/driver/motr/rgw_sal_motr.cc b/src/rgw/driver/motr/rgw_sal_motr.cc
index bb3af3a63a1..b999673ac18 100644
--- a/src/rgw/driver/motr/rgw_sal_motr.cc
+++ b/src/rgw/driver/motr/rgw_sal_motr.cc
@@ -585,7 +585,7 @@ int MotrBucket::remove(const DoutPrefixProvider *dpp, bool delete_children, opti
std::unique_ptr<rgw::sal::Object> object = get_object(key);
- ret = object->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP);
+ ret = object->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP, nullptr, nullptr);
if (ret < 0 && ret != -ENOENT) {
ldpp_dout(dpp, 0) << "ERROR: remove_bucket rgw_remove_object failed rc=" << ret << dendl;
return ret;
@@ -1502,7 +1502,11 @@ int MotrObject::MotrDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional
return 0;
}
-int MotrObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, uint32_t flags)
+int MotrObject::delete_object(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv)
{
MotrObject::MotrDeleteOp del_op(this);
del_op.params.bucket_owner = bucket->get_info().owner;
@@ -2668,7 +2672,8 @@ int MotrMultipartUpload::complete(const DoutPrefixProvider *dpp,
RGWCompressionInfo& cs_info, off_t& off,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj)
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes)
{
char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
@@ -2878,6 +2883,15 @@ int MotrMultipartUpload::complete(const DoutPrefixProvider *dpp,
M0_IC_DEL, meta_obj->get_key().get_oid(), bl);
}
+int MotrMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes)
+{
+ return -ENOTSUP;
+}
+
int MotrMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs)
{
if (!rule && !attrs) {
diff --git a/src/rgw/driver/motr/rgw_sal_motr.h b/src/rgw/driver/motr/rgw_sal_motr.h
index 4d9c189993b..f92074b9d94 100644
--- a/src/rgw/driver/motr/rgw_sal_motr.h
+++ b/src/rgw/driver/motr/rgw_sal_motr.h
@@ -657,7 +657,9 @@ class MotrObject : public StoreObject {
virtual int delete_object(const DoutPrefixProvider* dpp,
optional_yield y,
- uint32_t flags) override;
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv) override;
virtual int copy_object(const ACLOwner& owner,
const rgw_user& remote_user,
req_info* info, const rgw_zone_id& source_zone,
@@ -933,7 +935,13 @@ public:
RGWCompressionInfo& cs_info, off_t& off,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj) override;
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes) override;
+ virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes) override;
virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override;
virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
optional_yield y,
diff --git a/src/rgw/driver/posix/rgw_sal_posix.cc b/src/rgw/driver/posix/rgw_sal_posix.cc
index 8ffe4a0d0ca..d676213727e 100644
--- a/src/rgw/driver/posix/rgw_sal_posix.cc
+++ b/src/rgw/driver/posix/rgw_sal_posix.cc
@@ -2730,7 +2730,9 @@ int POSIXBucket::rename(const DoutPrefixProvider* dpp, optional_yield y, Object*
int POSIXObject::delete_object(const DoutPrefixProvider* dpp,
optional_yield y,
- uint32_t flags)
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv)
{
POSIXBucket *b = static_cast<POSIXBucket*>(get_bucket());
if (!b) {
@@ -3505,7 +3507,7 @@ int POSIXObject::POSIXReadOp::get_attr(const DoutPrefixProvider* dpp, const char
int POSIXObject::POSIXDeleteOp::delete_obj(const DoutPrefixProvider* dpp,
optional_yield y, uint32_t flags)
{
- return source->delete_object(dpp, y, flags);
+ return source->delete_object(dpp, y, flags, nullptr, nullptr);
}
int POSIXObject::copy(const DoutPrefixProvider *dpp, optional_yield y,
@@ -3703,7 +3705,8 @@ int POSIXMultipartUpload::complete(const DoutPrefixProvider *dpp,
RGWCompressionInfo& cs_info, off_t& ofs,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj)
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes)
{
char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
@@ -3852,6 +3855,15 @@ int POSIXMultipartUpload::complete(const DoutPrefixProvider *dpp,
return 0;
}
+int POSIXMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes)
+{
+ return -ENOTSUP;
+}
+
int POSIXMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y,
rgw_placement_rule** rule, rgw::sal::Attrs* attrs)
{
diff --git a/src/rgw/driver/posix/rgw_sal_posix.h b/src/rgw/driver/posix/rgw_sal_posix.h
index 5291ba500f9..efe3bfd7a50 100644
--- a/src/rgw/driver/posix/rgw_sal_posix.h
+++ b/src/rgw/driver/posix/rgw_sal_posix.h
@@ -632,7 +632,9 @@ public:
virtual int delete_object(const DoutPrefixProvider* dpp,
optional_yield y,
- uint32_t flags) override;
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv) override;
virtual int copy_object(const ACLOwner& owner,
const rgw_user& remote_user,
req_info* info, const rgw_zone_id& source_zone,
@@ -879,7 +881,13 @@ public:
RGWCompressionInfo& cs_info, off_t& ofs,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj) override;
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes) override;
+ virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes) override;
virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y,
rgw_placement_rule** rule, rgw::sal::Attrs* attrs) override;
diff --git a/src/rgw/driver/rados/rgw_bucket.cc b/src/rgw/driver/rados/rgw_bucket.cc
index f486b0405af..6bb1e9572da 100644
--- a/src/rgw/driver/rados/rgw_bucket.cc
+++ b/src/rgw/driver/rados/rgw_bucket.cc
@@ -149,7 +149,7 @@ int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, r
std::unique_ptr<rgw::sal::Object> object = bucket->get_object(key);
- return object->delete_object(dpp, y, rgw::sal::FLAG_LOG_OP);
+ return object->delete_object(dpp, y, rgw::sal::FLAG_LOG_OP, nullptr, nullptr);
}
static void set_err_msg(std::string *sink, std::string msg)
diff --git a/src/rgw/driver/rados/rgw_notify.cc b/src/rgw/driver/rados/rgw_notify.cc
index 6b1dd9651f0..6fdc4b91677 100644
--- a/src/rgw/driver/rados/rgw_notify.cc
+++ b/src/rgw/driver/rados/rgw_notify.cc
@@ -60,22 +60,19 @@ auto make_stack_allocator() {
const std::string Q_LIST_OBJECT_NAME = "queues_list_object";
struct PublishCommitCompleteArg {
+ PublishCommitCompleteArg(const std::string& _queue_name, CephContext* _cct)
+ : queue_name{_queue_name}, cct{_cct} {}
- PublishCommitCompleteArg(std::string _queue_name, const DoutPrefixProvider *_dpp)
- : queue_name{std::move(_queue_name)}, dpp{_dpp} {}
-
- std::string queue_name;
- const DoutPrefixProvider *dpp;
+ const std::string queue_name;
+ CephContext* const cct;
};
-void publish_commit_completion(rados_completion_t completion, void *arg) {
- auto *comp_obj = reinterpret_cast<librados::AioCompletionImpl *>(completion);
- std::unique_ptr<PublishCommitCompleteArg> pcc_arg(reinterpret_cast<PublishCommitCompleteArg *>(arg));
- if (comp_obj->get_return_value() < 0) {
- ldpp_dout(pcc_arg->dpp, 1) << "ERROR: failed to commit reservation to queue: "
- << pcc_arg->queue_name << ". error: " << comp_obj->get_return_value()
- << dendl;
- }
+void publish_commit_completion(rados_completion_t completion, void* arg) {
+ std::unique_ptr<PublishCommitCompleteArg> pcc_args{reinterpret_cast<PublishCommitCompleteArg*>(arg)};
+ if (const auto rc = rados_aio_get_return_value(completion); rc < 0) {
+ ldout(pcc_args->cct, 1) << "ERROR: failed to commit reservation to queue: "
+ << pcc_args->queue_name << ". error: " << rc << dendl;
+ }
};
class Manager : public DoutPrefixProvider {
@@ -1243,19 +1240,17 @@ int publish_commit(rgw::sal::Object* obj,
std::vector<buffer::list> bl_data_vec{std::move(bl)};
librados::ObjectWriteOperation op;
cls_2pc_queue_commit(op, bl_data_vec, topic.res_id);
- aio_completion_ptr completion {librados::Rados::aio_create_completion()};
- auto pcc_arg = make_unique<PublishCommitCompleteArg>(queue_name, dpp);
- completion->set_complete_callback(pcc_arg.get(), publish_commit_completion);
- auto &io_ctx = res.store->getRados()->get_notif_pool_ctx();
- int ret = io_ctx.aio_operate(queue_name, completion.get(), &op);
topic.res_id = cls_2pc_reservation::NO_ID;
- if (ret < 0) {
+ auto pcc_arg = make_unique<PublishCommitCompleteArg>(queue_name, dpp->get_cct());
+ aio_completion_ptr completion{librados::Rados::aio_create_completion(pcc_arg.get(), publish_commit_completion)};
+ auto& io_ctx = res.store->getRados()->get_notif_pool_ctx();
+ if (const int ret = io_ctx.aio_operate(queue_name, completion.get(), &op); ret < 0) {
ldpp_dout(dpp, 1) << "ERROR: failed to commit reservation to queue: "
<< queue_name << ". error: " << ret << dendl;
return ret;
}
+ // args will be released inside the callback
pcc_arg.release();
- completion.release();
} else {
try {
// TODO add endpoint LRU cache
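The rewritten completion path above hands ownership of the PublishCommitCompleteArg to the callback and reads the result via rados_aio_get_return_value() instead of poking at AioCompletionImpl. A stripped-down sketch of the same ownership hand-off, using only calls that appear in the hunk; log_arg, on_complete and queue_write are placeholders, not code from this patch:

// Sketch: the arg is owned by a unique_ptr until the async op is successfully
// queued, after which the completion callback reclaims and frees it.
#include <memory>
#include <string>
#include "include/rados/librados.h"
#include "include/rados/librados.hpp"

struct log_arg {                 // stand-in for PublishCommitCompleteArg
  std::string oid;
};

static void on_complete(rados_completion_t c, void* arg) {
  std::unique_ptr<log_arg> a{static_cast<log_arg*>(arg)};   // take ownership back
  if (const int rc = rados_aio_get_return_value(c); rc < 0) {
    // log the failure for a->oid here
  }
}                                // a is destroyed when the callback returns

int queue_write(librados::IoCtx& ioctx, const std::string& oid,
                librados::ObjectWriteOperation&& op)
{
  auto arg = std::make_unique<log_arg>();
  arg->oid = oid;
  librados::AioCompletion* completion =
      librados::Rados::aio_create_completion(arg.get(), on_complete);
  const int r = ioctx.aio_operate(oid, completion, &op);
  completion->release();         // the queued op keeps its own reference
  if (r < 0) {
    return r;                    // never queued: arg is freed by the unique_ptr
  }
  arg.release();                 // queued: the callback now owns arg
  return 0;
}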
diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.cc b/src/rgw/driver/rados/rgw_object_expirer_core.cc
index c285443d0b0..a5d788ea469 100644
--- a/src/rgw/driver/rados/rgw_object_expirer_core.cc
+++ b/src/rgw/driver/rados/rgw_object_expirer_core.cc
@@ -225,7 +225,7 @@ int RGWObjectExpirer::garbage_single_object(const DoutPrefixProvider *dpp, objex
std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
obj->set_atomic();
- ret = obj->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP);
+ ret = obj->delete_object(dpp, null_yield, rgw::sal::FLAG_LOG_OP, nullptr, nullptr);
return ret;
}
diff --git a/src/rgw/driver/rados/rgw_putobj_processor.cc b/src/rgw/driver/rados/rgw_putobj_processor.cc
index 79be7b3209b..f04ed1db8d4 100644
--- a/src/rgw/driver/rados/rgw_putobj_processor.cc
+++ b/src/rgw/driver/rados/rgw_putobj_processor.cc
@@ -22,6 +22,8 @@
#include "services/svc_zone.h"
#include "rgw_sal_rados.h"
+#include "cls/version/cls_version_client.h"
+
#define dout_subsys ceph_subsys_rgw
using namespace std;
@@ -573,7 +575,9 @@ int MultipartObjectProcessor::complete(
}
librados::ObjectWriteOperation op;
+ op.assert_exists();
cls_rgw_mp_upload_part_info_update(op, p, info);
+ cls_version_inc(op);
r = rgw_rados_operate(rctx.dpp, meta_obj_ref.ioctx, meta_obj_ref.obj.oid, &op, rctx.y);
ldpp_dout(rctx.dpp, 20) << "Update meta: " << meta_obj_ref.obj.oid << " part " << p << " prefix " << info.manifest.get_prefix() << " return " << r << dendl;
@@ -588,8 +592,10 @@ int MultipartObjectProcessor::complete(
op = librados::ObjectWriteOperation{};
op.assert_exists(); // detect races with abort
op.omap_set(m);
+ cls_version_inc(op);
r = rgw_rados_operate(rctx.dpp, meta_obj_ref.ioctx, meta_obj_ref.obj.oid, &op, rctx.y);
}
+
if (r < 0) {
return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r;
}
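The two hunks above pair every update of the multipart meta object with assert_exists() and cls_version_inc(), so a later delete or rewrite can be made conditional on the version that was observed (see the cls_version_check() added to RGWRados::Object::Delete below). A compact sketch of that read-check-increment pattern; the object name and omap payload are placeholders:

// Sketch of optimistic concurrency with cls_version: read the current version,
// then make the update conditional on it and bump it in the same operation.
#include <map>
#include <string>
#include "cls/version/cls_version_client.h"
#include "include/rados/librados.hpp"

int guarded_update(librados::IoCtx& ioctx, const std::string& oid,
                   const std::map<std::string, ceph::bufferlist>& omap_values)
{
  obj_version ver;
  {
    librados::ObjectReadOperation rop;
    cls_version_read(rop, &ver);              // current version of the meta object
    ceph::bufferlist unused;
    if (int r = ioctx.operate(oid, &rop, &unused); r < 0) {
      return r;
    }
  }

  librados::ObjectWriteOperation wop;
  wop.assert_exists();                        // detect races with removal
  cls_version_check(wop, ver, VER_COND_EQ);   // -ECANCELED if someone else bumped it
  wop.omap_set(omap_values);
  cls_version_inc(wop);                       // publish our own update
  return ioctx.operate(oid, &wop);
}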
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc
index 1f1d75cb07b..a967030f622 100644
--- a/src/rgw/driver/rados/rgw_rados.cc
+++ b/src/rgw/driver/rados/rgw_rados.cc
@@ -5857,6 +5857,10 @@ int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvi
store->remove_rgw_head_obj(op);
+ if (params.check_objv != nullptr) {
+ cls_version_check(op, *params.check_objv, VER_COND_EQ);
+ }
+
auto& ioctx = ref.ioctx;
r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y);
@@ -6051,7 +6055,7 @@ int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *oc
int r = -ENOENT;
if (!assume_noent) {
- r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
+ r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), &s->objv_tracker, y);
}
if (r == -ENOENT) {
@@ -6784,6 +6788,10 @@ int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *
return -ENOENT;
}
+ if (params.objv_tracker) {
+ *params.objv_tracker = astate->objv_tracker;
+ }
+
RGWBucketInfo& bucket_info = source->get_bucket_info();
if (params.part_num) {
@@ -8781,6 +8789,7 @@ int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
if (first_chunk) {
op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
}
+
bufferlist outbl;
r = rgw_rados_operate(dpp, ref.ioctx, ref.obj.oid, &op, &outbl, y);
@@ -9388,13 +9397,6 @@ int RGWRados::process_gc(bool expired_only, optional_yield y)
return gc->process(expired_only, y);
}
-int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
- vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
- int& index)
-{
- return lc->list_lc_progress(marker, max_entries, progress_map, index);
-}
-
int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
{
RGWLC lc;
diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h
index d4aabe7ba76..9d520c12a1e 100644
--- a/src/rgw/driver/rados/rgw_rados.h
+++ b/src/rgw/driver/rados/rgw_rados.h
@@ -771,6 +771,7 @@ public:
uint64_t *epoch;
int* part_num = nullptr;
std::optional<int> parts_count;
+ RGWObjVersionTracker *objv_tracker = nullptr;
Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
target_obj(nullptr), epoch(nullptr)
@@ -854,8 +855,9 @@ public:
rgw_zone_set *zones_trace;
bool abortmp;
uint64_t parts_accounted_size;
+ obj_version *check_objv;
- DeleteParams() : versioning_status(0), null_verid(false), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
+ DeleteParams() : versioning_status(0), null_verid(false), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0), check_objv(nullptr) {}
} params;
struct DeleteResult {
@@ -1571,9 +1573,6 @@ public:
int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y);
int process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket);
- int list_lc_progress(std::string& marker, uint32_t max_entries,
- std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
- int& index);
int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
std::map<RGWObjCategory, RGWStorageStats> *existing_stats,
diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc
index 50c738d5435..e3b435d2839 100644
--- a/src/rgw/driver/rados/rgw_sal_rados.cc
+++ b/src/rgw/driver/rados/rgw_sal_rados.cc
@@ -2318,6 +2318,7 @@ int RadosObject::read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Rea
read_op.params.target_obj = target_obj;
read_op.params.obj_size = &state.size;
read_op.params.lastmod = &state.mtime;
+ read_op.params.objv_tracker = &state.objv_tracker;
return read_op.prepare(y, dpp);
}
@@ -2848,6 +2849,9 @@ int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, option
parent_op.params.abortmp = params.abortmp;
parent_op.params.parts_accounted_size = params.parts_accounted_size;
parent_op.params.null_verid = params.null_verid;
+ if (params.objv_tracker) {
+ parent_op.params.check_objv = params.objv_tracker->version_for_check();
+ }
int ret = parent_op.delete_obj(y, dpp, flags & FLAG_LOG_OP);
if (ret < 0)
@@ -2861,7 +2865,9 @@ int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, option
int RadosObject::delete_object(const DoutPrefixProvider* dpp,
optional_yield y,
- uint32_t flags)
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv)
{
RGWRados::Object del_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
RGWRados::Object::Delete del_op(&del_target);
@@ -2869,6 +2875,10 @@ int RadosObject::delete_object(const DoutPrefixProvider* dpp,
del_op.params.bucket_owner = bucket->get_info().owner;
del_op.params.versioning_status = (flags & FLAG_PREVENT_VERSIONING)
? 0 : bucket->get_info().versioning_status();
+ del_op.params.remove_objs = remove_objs;
+ if (objv) {
+ del_op.params.check_objv = objv->version_for_check();
+ }
return del_op.delete_obj(y, dpp, flags & FLAG_LOG_OP);
}
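With the extended signature, callers can pass extra bucket-index keys to remove and a version tracker, so the head-object delete is rejected with -ECANCELED when the object changed since it was read. A caller-side sketch of the new interface; delete_if_unchanged and its arguments are illustrative, not from this patch:

// Sketch: conditional delete through the extended rgw::sal::Object::delete_object();
// obj is assumed to have been read first so its version tracker is populated
// (see the objv_tracker plumbing added to Read::prepare above).
#include <list>
#include "rgw_sal.h"

int delete_if_unchanged(const DoutPrefixProvider* dpp, optional_yield y,
                        rgw::sal::Object* obj)
{
  std::list<rgw_obj_index_key> remove_objs;   // extra index entries to drop, may stay empty
  RGWObjVersionTracker objv = obj->get_version_tracker();

  int r = obj->delete_object(dpp, y, rgw::sal::FLAG_LOG_OP, &remove_objs, &objv);
  if (r == -ECANCELED) {
    // the object changed since we read it; the caller decides whether to retry
  }
  return r;
}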
@@ -2964,13 +2974,84 @@ int RadosObject::swift_versioning_copy(const ACLOwner& owner, const rgw_user& re
y);
}
+int RadosMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes)
+{
+ bool truncated;
+ int ret;
+ int max_parts = 1000;
+ int marker = 0;
+ cls_rgw_obj_chain chain;
+
+ do {
+ ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated, y);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " << ret << dendl;
+ return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
+ }
+
+ for (auto part_it = parts.begin(); part_it != parts.end(); ++part_it) {
+ RadosMultipartPart* part = dynamic_cast<RadosMultipartPart*>(part_it->second.get());
+
+ auto& part_prefixes = processed_prefixes[part->info.num];
+
+ if (!part->info.manifest.empty()) {
+ auto manifest_prefix = part->info.manifest.get_prefix();
+ if (not manifest_prefix.empty() && part_prefixes.find(manifest_prefix) == part_prefixes.end()) {
+ store->getRados()->update_gc_chain(dpp, obj, part->info.manifest, &chain);
+
+ RGWObjManifest::obj_iterator oiter = part->info.manifest.obj_begin(dpp);
+ if (oiter != part->info.manifest.obj_end(dpp)) {
+ rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store->getRados());
+
+ rgw_obj head_obj;
+ RGWSI_Tier_RADOS::raw_obj_to_obj(bucket->get_key(), raw_head, &head_obj);
+
+ rgw_obj_index_key remove_key;
+ head_obj.key.get_index_key(&remove_key);
+ remove_objs.push_back(remove_key);
+ }
+ }
+ }
+ cleanup_part_history(dpp, y, part, remove_objs, part_prefixes);
+ }
+ } while (truncated);
+
+ if (store->getRados()->get_gc() == nullptr) {
+ //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified)
+ store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id(), y);
+ } else {
+ /* use upload id as tag and do it synchronously */
+ auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id(), y);
+ if (ret < 0 && leftover_chain) {
+ ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
+ if (ret == -ENOENT) {
+ return -ERR_NO_SUCH_UPLOAD;
+ }
+ //Delete objects inline if send chain to gc fails
+ store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id(), y);
+ }
+ }
+ return 0;
+}
+
int RadosMultipartUpload::cleanup_part_history(const DoutPrefixProvider* dpp,
optional_yield y,
RadosMultipartPart *part,
- list<rgw_obj_index_key>& remove_objs)
+ list<rgw_obj_index_key>& remove_objs,
+ boost::container::flat_set<std::string>& processed_prefixes)
{
cls_rgw_obj_chain chain;
for (auto& ppfx : part->get_past_prefixes()) {
+ auto [it, inserted] = processed_prefixes.emplace(ppfx);
+ if (!inserted) {
+ continue; // duplicate
+ }
+
rgw_obj past_obj;
past_obj.init_ns(bucket->get_key(), ppfx + "." + std::to_string(part->info.num), mp_ns);
rgw_obj_index_key past_key;
@@ -3018,77 +3099,105 @@ int RadosMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct,
int ret;
uint64_t parts_accounted_size = 0;
- do {
- ret = list_parts(dpp, cct, 1000, marker, &marker, &truncated, y);
+ prefix_map_t processed_prefixes;
+
+ static constexpr auto MAX_DELETE_RETRIES = 15u;
+ for (auto i = 0u; i < MAX_DELETE_RETRIES; i++) {
+ ret = meta_obj->get_obj_attrs(y, dpp);
if (ret < 0) {
- ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " <<
- ret << dendl;
+ ldpp_dout(dpp, 0) << __func__ << ": ERROR: failed to get obj attrs, obj=" << meta_obj
+ << " ret=" << ret << dendl;
return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
}
- for (auto part_it = parts.begin();
- part_it != parts.end();
- ++part_it) {
- RadosMultipartPart* obj_part = dynamic_cast<RadosMultipartPart*>(part_it->second.get());
- if (obj_part->info.manifest.empty()) {
- std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
- rgw_obj_key(obj_part->oid, std::string(), RGW_OBJ_NS_MULTIPART));
- obj->set_hash_source(mp_obj.get_key());
- ret = obj->delete_object(dpp, y, 0);
- if (ret < 0 && ret != -ENOENT)
- return ret;
- } else {
- auto target = meta_obj->get_obj();
- store->getRados()->update_gc_chain(dpp, target, obj_part->info.manifest, &chain);
- RGWObjManifest::obj_iterator oiter = obj_part->info.manifest.obj_begin(dpp);
- if (oiter != obj_part->info.manifest.obj_end(dpp)) {
- std::unique_ptr<rgw::sal::Object> head = bucket->get_object(rgw_obj_key());
- rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store->getRados());
- dynamic_cast<rgw::sal::RadosObject*>(head.get())->raw_obj_to_obj(raw_head);
-
- rgw_obj_index_key key;
- head->get_key().get_index_key(&key);
- remove_objs.push_back(key);
-
- cleanup_part_history(dpp, null_yield, obj_part, remove_objs);
+ RGWObjVersionTracker objv_tracker = meta_obj->get_version_tracker();
+
+ do {
+ ret = list_parts(dpp, cct, 1000, marker, &marker, &truncated, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " << ret << dendl;
+ return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
+ }
+
+ for (auto part_it = parts.begin(); part_it != parts.end(); ++part_it) {
+ RadosMultipartPart* obj_part = dynamic_cast<RadosMultipartPart*>(part_it->second.get());
+
+ if (obj_part->info.manifest.empty()) {
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
+ rgw_obj_key(obj_part->oid, std::string(), RGW_OBJ_NS_MULTIPART));
+ obj->set_hash_source(mp_obj.get_key());
+ ret = obj->delete_object(dpp, y, 0, nullptr, nullptr);
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+ } else {
+ auto manifest_prefix = obj_part->info.manifest.get_prefix();
+ auto [it, inserted] = processed_prefixes.emplace(obj_part->info.num, boost::container::flat_set<std::string>{});
+ if (not manifest_prefix.empty()) {
+ if (it->second.find(manifest_prefix) != it->second.end()) {
+ continue;
+ }
+ it->second.emplace(manifest_prefix);
+ }
+
+ auto target = meta_obj->get_obj();
+ store->getRados()->update_gc_chain(dpp, target, obj_part->info.manifest, &chain);
+ RGWObjManifest::obj_iterator oiter = obj_part->info.manifest.obj_begin(dpp);
+ if (oiter != obj_part->info.manifest.obj_end(dpp)) {
+ std::unique_ptr<rgw::sal::Object> head = bucket->get_object(rgw_obj_key());
+ rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store->getRados());
+ dynamic_cast<rgw::sal::RadosObject*>(head.get())->raw_obj_to_obj(raw_head);
+
+ rgw_obj_index_key key;
+ head->get_key().get_index_key(&key);
+ remove_objs.push_back(key);
+
+ cleanup_part_history(dpp, null_yield, obj_part, remove_objs, it->second);
+ }
}
+ parts_accounted_size += obj_part->info.accounted_size;
}
- parts_accounted_size += obj_part->info.accounted_size;
- }
- } while (truncated);
+ } while (truncated);
- if (store->getRados()->get_gc() == nullptr) {
- //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified)
- store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id(), y);
- } else {
- /* use upload id as tag and do it synchronously */
- auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id(), y);
- if (ret < 0 && leftover_chain) {
- ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
- if (ret == -ENOENT) {
- return -ERR_NO_SUCH_UPLOAD;
+ if (store->getRados()->get_gc() == nullptr) {
+ //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified)
+ store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id(), y);
+ } else {
+ /* use upload id as tag and do it synchronously */
+ auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id(), y);
+ if (ret < 0 && leftover_chain) {
+ ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
+ if (ret == -ENOENT) {
+ return -ERR_NO_SUCH_UPLOAD;
+ }
+ //Delete objects inline if send chain to gc fails
+ store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id(), y);
}
- //Delete objects inline if send chain to gc fails
- store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id(), y);
}
- }
- std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = meta_obj->get_delete_op();
- del_op->params.bucket_owner = bucket->get_info().owner;
- del_op->params.versioning_status = 0;
- if (!remove_objs.empty()) {
- del_op->params.remove_objs = &remove_objs;
- }
+ std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = meta_obj->get_delete_op();
+ del_op->params.bucket_owner = bucket->get_info().owner;
+ del_op->params.versioning_status = 0;
+ if (!remove_objs.empty()) {
+ del_op->params.remove_objs = &remove_objs;
+ }
- del_op->params.abortmp = true;
- del_op->params.parts_accounted_size = parts_accounted_size;
+ del_op->params.abortmp = true;
+ del_op->params.parts_accounted_size = parts_accounted_size;
+ del_op->params.objv_tracker = &objv_tracker;
- // and also remove the metadata obj
- ret = del_op->delete_obj(dpp, y, 0);
- if (ret < 0) {
- ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " <<
- ret << dendl;
+ // and also remove the metadata obj
+ ret = del_op->delete_obj(dpp, y, 0);
+ if (ret != -ECANCELED) {
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " << ret << dendl;
+ }
+ break;
+ }
+ ldpp_dout(dpp, 20) << "deleting meta_obj is cancelled due to mismatch cls_version: " << objv_tracker << dendl;
+ chain.objs.clear();
+ marker = 0;
}
+
return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
}
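The retry loop above relies on per-part bookkeeping so that manifest prefixes already queued for removal on an earlier pass are not processed again. The shape of prefix_map_t below is inferred from how the hunks use it (a flat_set of prefixes keyed by part number); the helper is a placeholder, not code from this patch:

// Sketch of the dedup bookkeeping: emplace() reports whether a prefix was
// already handled for this part, so repeated passes can skip it.
#include <boost/container/flat_map.hpp>
#include <boost/container/flat_set.hpp>
#include <string>

using prefix_map_t =
    boost::container::flat_map<int, boost::container::flat_set<std::string>>;

bool mark_prefix_processed(prefix_map_t& processed, int part_num,
                           const std::string& prefix)
{
  auto [it, part_inserted] =
      processed.emplace(part_num, boost::container::flat_set<std::string>{});
  (void)part_inserted;                 // the set may already exist for this part
  auto [pit, inserted] = it->second.emplace(prefix);
  return inserted;                     // false -> already processed, skip it
}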
@@ -3269,7 +3378,8 @@ int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp,
RGWCompressionInfo& cs_info, off_t& ofs,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj)
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes)
{
char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
@@ -3341,6 +3451,8 @@ int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp,
rgw_obj src_obj;
src_obj.init_ns(bucket->get_key(), oid, mp_ns);
+ auto [it, inserted] = processed_prefixes.emplace(part->info.num, boost::container::flat_set<std::string>{});
+
if (obj_part.manifest.empty()) {
ldpp_dout(dpp, 0) << "ERROR: empty manifest for object part: obj="
<< src_obj << dendl;
@@ -3352,6 +3464,7 @@ int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp,
if (not manifest_prefix.empty()) {
// It has an explicit prefix. Override the default one.
src_obj.init_ns(bucket->get_key(), manifest_prefix + "." + std::to_string(part->info.num), mp_ns);
+ it->second.emplace(manifest_prefix);
}
}
@@ -3397,7 +3510,7 @@ int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp,
remove_objs.push_back(remove_key);
- cleanup_part_history(dpp, y, part, remove_objs);
+ cleanup_part_history(dpp, y, part, remove_objs, it->second);
ofs += obj_part.size;
accounted_size += obj_part.accounted_size;
@@ -3614,7 +3727,7 @@ int LCRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, opti
int RadosLifecycle::get_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry)
+ LCEntry& entry)
{
librados::ObjectReadOperation op;
bufferlist bl;
@@ -3632,13 +3745,15 @@ int RadosLifecycle::get_entry(const DoutPrefixProvider* dpp, optional_yield y,
return ret;
}
- *entry = std::make_unique<StoreLCEntry>(cls_entry.bucket, cls_entry.start_time, cls_entry.status);
+ entry.bucket = std::move(cls_entry.bucket);
+ entry.start_time = cls_entry.start_time;
+ entry.status = cls_entry.status;
return 0;
}
int RadosLifecycle::get_next_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry)
+ LCEntry& entry)
{
librados::ObjectReadOperation op;
bufferlist bl;
@@ -3656,18 +3771,20 @@ int RadosLifecycle::get_next_entry(const DoutPrefixProvider* dpp, optional_yield
return ret;
}
- *entry = std::make_unique<StoreLCEntry>(cls_entry.bucket, cls_entry.start_time, cls_entry.status);
+ entry.bucket = std::move(cls_entry.bucket);
+ entry.start_time = cls_entry.start_time;
+ entry.status = cls_entry.status;
return 0;
}
int RadosLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry)
+ const std::string& oid, const LCEntry& entry)
{
cls_rgw_lc_entry cls_entry;
- cls_entry.bucket = entry.get_bucket();
- cls_entry.start_time = entry.get_start_time();
- cls_entry.status = entry.get_status();
+ cls_entry.bucket = entry.bucket;
+ cls_entry.start_time = entry.start_time;
+ cls_entry.status = entry.status;
librados::ObjectWriteOperation op;
cls_rgw_lc_set_entry(op, cls_entry);
@@ -3678,7 +3795,7 @@ int RadosLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y,
int RadosLifecycle::list_entries(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- uint32_t max_entries, std::vector<std::unique_ptr<LCEntry>>& entries)
+ uint32_t max_entries, std::vector<LCEntry>& entries)
{
entries.clear();
@@ -3699,20 +3816,19 @@ int RadosLifecycle::list_entries(const DoutPrefixProvider* dpp, optional_yield y
}
for (auto& entry : cls_entries) {
- entries.push_back(std::make_unique<StoreLCEntry>(entry.bucket, oid,
- entry.start_time, entry.status));
+ entries.push_back(LCEntry{entry.bucket, entry.start_time, entry.status});
}
return ret;
}
int RadosLifecycle::rm_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry)
+ const std::string& oid, const LCEntry& entry)
{
cls_rgw_lc_entry cls_entry;
- cls_entry.bucket = entry.get_bucket();
- cls_entry.start_time = entry.get_start_time();
- cls_entry.status = entry.get_status();
+ cls_entry.bucket = entry.bucket;
+ cls_entry.start_time = entry.start_time;
+ cls_entry.status = entry.status;
librados::ObjectWriteOperation op;
cls_rgw_lc_rm_entry(op, cls_entry);
@@ -3722,7 +3838,7 @@ int RadosLifecycle::rm_entry(const DoutPrefixProvider* dpp, optional_yield y,
}
int RadosLifecycle::get_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, std::unique_ptr<LCHead>* head)
+ const std::string& oid, LCHead& head)
{
librados::ObjectReadOperation op;
bufferlist bl;
@@ -3740,18 +3856,20 @@ int RadosLifecycle::get_head(const DoutPrefixProvider* dpp, optional_yield y,
return ret;
}
- *head = std::make_unique<StoreLCHead>(cls_head.start_date, cls_head.shard_rollover_date, cls_head.marker);
+ head.start_date = cls_head.start_date;
+ head.shard_rollover_date = cls_head.shard_rollover_date;
+ head.marker = std::move(cls_head.marker);
return 0;
}
int RadosLifecycle::put_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCHead& head)
+ const std::string& oid, const LCHead& head)
{
cls_rgw_lc_obj_head cls_head;
- cls_head.marker = head.get_marker();
- cls_head.start_date = head.get_start_date();
- cls_head.shard_rollover_date = head.get_shard_rollover_date();
+ cls_head.marker = head.marker;
+ cls_head.start_date = head.start_date;
+ cls_head.shard_rollover_date = head.shard_rollover_date;
librados::ObjectWriteOperation op;
cls_rgw_lc_put_head(op, cls_head);
diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h
index 98d0bc9d005..d359f733ab3 100644
--- a/src/rgw/driver/rados/rgw_sal_rados.h
+++ b/src/rgw/driver/rados/rgw_sal_rados.h
@@ -559,7 +559,9 @@ class RadosObject : public StoreObject {
rados_ctx->invalidate(get_obj());
}
virtual int delete_object(const DoutPrefixProvider* dpp,
- optional_yield y, uint32_t flags) override;
+ optional_yield y, uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv) override;
virtual int copy_object(const ACLOwner& owner,
const rgw_user& remote_user,
req_info* info, const rgw_zone_id& source_zone,
@@ -825,7 +827,13 @@ public:
RGWCompressionInfo& cs_info, off_t& ofs,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj) override;
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes) override;
+ virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes) override;
virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override;
virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
optional_yield y,
@@ -838,7 +846,8 @@ protected:
int cleanup_part_history(const DoutPrefixProvider* dpp,
optional_yield y,
RadosMultipartPart* part,
- std::list<rgw_obj_index_key>& remove_objs);
+ std::list<rgw_obj_index_key>& remove_objs,
+ boost::container::flat_set<std::string>& processed_prefixes);
};
class MPRadosSerializer : public StoreMPSerializer {
@@ -868,31 +877,30 @@ public:
}
};
-class RadosLifecycle : public StoreLifecycle {
+class RadosLifecycle : public Lifecycle {
RadosStore* store;
public:
RadosLifecycle(RadosStore* _st) : store(_st) {}
- using StoreLifecycle::get_entry;
virtual int get_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry) override;
+ LCEntry& entry) override;
virtual int get_next_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry) override;
+ LCEntry& entry) override;
virtual int set_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry) override;
+ const std::string& oid, const LCEntry& entry) override;
virtual int list_entries(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
uint32_t max_entries,
- std::vector<std::unique_ptr<LCEntry>>& entries) override;
+ std::vector<LCEntry>& entries) override;
virtual int rm_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry) override;
+ const std::string& oid, const LCEntry& entry) override;
virtual int get_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, std::unique_ptr<LCHead>* head) override;
- virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, LCHead& head) override;
+ virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y,
+ const std::string& oid, const LCHead& head) override;
virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
const std::string& oid,
const std::string& cookie) override;
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 3fe5f686bf0..966e912b05d 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -8835,16 +8835,16 @@ next:
if (opt_cmd == OPT::LC_LIST) {
formatter->open_array_section("lifecycle_list");
- vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> bucket_lc_map;
+ vector<rgw::sal::LCEntry> bucket_lc_map;
string marker;
int index{0};
#define MAX_LC_LIST_ENTRIES 100
if (max_entries < 0) {
max_entries = MAX_LC_LIST_ENTRIES;
}
+ RGWLC* lc = driver->get_rgwlc();
do {
- int ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->list_lc_progress(marker, max_entries,
- bucket_lc_map, index);
+ int ret = lc->list_lc_progress(marker, max_entries, bucket_lc_map, index);
if (ret < 0) {
cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret)
<< std::endl;
@@ -8852,17 +8852,15 @@ next:
}
for (const auto& entry : bucket_lc_map) {
formatter->open_object_section("bucket_lc_info");
- formatter->dump_string("bucket", entry->get_bucket());
- formatter->dump_string("shard", entry->get_oid());
+ formatter->dump_string("bucket", entry.bucket);
char exp_buf[100];
- time_t t{time_t(entry->get_start_time())};
+ time_t t = entry.start_time;
if (std::strftime(
exp_buf, sizeof(exp_buf),
"%a, %d %b %Y %T %Z", std::gmtime(&t))) {
formatter->dump_string("started", exp_buf);
}
- string lc_status = LC_STATUS[entry->get_status()];
- formatter->dump_string("status", lc_status);
+ formatter->dump_string("status", LC_STATUS[entry.status]);
formatter->close_section(); // objs
formatter->flush(cout);
}
diff --git a/src/rgw/rgw_asio_frontend.cc b/src/rgw/rgw_asio_frontend.cc
index 1a28167c406..f80e40c70b6 100644
--- a/src/rgw/rgw_asio_frontend.cc
+++ b/src/rgw/rgw_asio_frontend.cc
@@ -3,9 +3,13 @@
#include <atomic>
#include <ctime>
+#include <list>
#include <memory>
-#include <vector>
+#include <boost/asio/bind_executor.hpp>
+#include <boost/asio/bind_cancellation_slot.hpp>
+#include <boost/asio/cancellation_signal.hpp>
+#include <boost/asio/detached.hpp>
#include <boost/asio/error.hpp>
#include <boost/asio/io_context.hpp>
#include <boost/asio/ip/tcp.hpp>
@@ -64,6 +68,44 @@ auto make_stack_allocator() {
return boost::context::protected_fixedsize_stack{512*1024};
}
+static constexpr std::chrono::milliseconds BACKOFF_MAX_WAIT(5000);
+
+class RGWAsioBackoff {
+ using Clock = ceph::coarse_mono_clock;
+ using Timer = boost::asio::basic_waitable_timer<Clock>;
+ Timer timer;
+
+ ceph::timespan cur_wait;
+ void update_wait_time();
+public:
+ explicit RGWAsioBackoff(boost::asio::io_context& context) :
+ timer(context),
+ cur_wait(std::chrono::milliseconds(1)) {
+ }
+
+ void backoff_sleep(boost::asio::yield_context yield);
+ void reset() {
+ cur_wait = std::chrono::milliseconds(1);
+ }
+};
+
+void RGWAsioBackoff::update_wait_time()
+{
+ if (cur_wait < BACKOFF_MAX_WAIT) {
+ cur_wait = cur_wait * 2;
+ }
+ if (cur_wait > BACKOFF_MAX_WAIT) {
+ cur_wait = BACKOFF_MAX_WAIT;
+ }
+}
+
+void RGWAsioBackoff::backoff_sleep(boost::asio::yield_context yield)
+{
+ update_wait_time();
+ timer.expires_after(cur_wait);
+ timer.async_wait(yield);
+}
+
using namespace std;
template <typename Stream>
@@ -424,29 +466,34 @@ class AsioFrontend {
tcp::endpoint endpoint;
tcp::acceptor acceptor;
tcp::socket socket;
+ boost::asio::cancellation_signal signal;
bool use_ssl = false;
bool use_nodelay = false;
explicit Listener(boost::asio::io_context& context)
: acceptor(context), socket(context) {}
};
- std::vector<Listener> listeners;
+ std::list<Listener> listeners;
ConnectionList connections;
std::atomic<bool> going_down{false};
+ RGWAsioBackoff backoff;
CephContext* ctx() const { return cct.get(); }
std::optional<dmc::ClientCounters> client_counters;
std::unique_ptr<dmc::ClientConfig> client_config;
- void accept(Listener& listener, boost::system::error_code ec);
+
+ void accept(Listener& listener, boost::asio::yield_context yield);
+ void on_accept(Listener& listener, tcp::socket stream);
public:
AsioFrontend(RGWProcessEnv& env, RGWFrontendConfig* conf,
dmc::SchedulerCtx& sched_ctx,
boost::asio::io_context& context)
: env(env), conf(conf), context(context),
- pause_mutex(context.get_executor())
+ pause_mutex(context.get_executor()),
+ backoff(context)
{
auto sched_t = dmc::get_scheduler_t(ctx());
switch(sched_t){
@@ -683,10 +730,13 @@ int AsioFrontend::init()
}
}
l.acceptor.listen(max_connection_backlog);
- l.acceptor.async_accept(l.socket,
- [this, &l] (boost::system::error_code ec) {
- accept(l, ec);
- });
+
+ // spawn a cancellable coroutine to run the accept loop
+ boost::asio::spawn(context,
+ [this, &l] (boost::asio::yield_context yield) mutable {
+ accept(l, yield);
+ }, bind_cancellation_slot(l.signal.slot(),
+ bind_executor(context, boost::asio::detached)));
ldout(ctx(), 4) << "frontend listening on " << l.endpoint << dendl;
socket_bound = true;
@@ -1003,22 +1053,39 @@ int AsioFrontend::init_ssl()
}
#endif // WITH_RADOSGW_BEAST_OPENSSL
-void AsioFrontend::accept(Listener& l, boost::system::error_code ec)
+void AsioFrontend::accept(Listener& l, boost::asio::yield_context yield)
{
- if (!l.acceptor.is_open()) {
- return;
- } else if (ec == boost::asio::error::operation_aborted) {
- return;
- } else if (ec) {
- ldout(ctx(), 1) << "accept failed: " << ec.message() << dendl;
- return;
+ for (;;) {
+ boost::system::error_code ec;
+ l.acceptor.async_accept(l.socket, yield[ec]);
+
+ if (!l.acceptor.is_open()) {
+ return;
+ } else if (ec == boost::asio::error::operation_aborted) {
+ return;
+ } else if (ec) {
+ ldout(ctx(), 1) << "accept failed: " << ec.message() << dendl;
+ if (ec == boost::system::errc::too_many_files_open ||
+ ec == boost::system::errc::too_many_files_open_in_system ||
+ ec == boost::system::errc::no_buffer_space ||
+ ec == boost::system::errc::not_enough_memory) {
+ // always retry accept() if we hit a resource limit
+ backoff.backoff_sleep(yield);
+ continue;
+ }
+ ldout(ctx(), 0) << "accept stopped due to error: " << ec.message() << dendl;
+ return;
+ }
+
+ backoff.reset();
+ on_accept(l, std::move(l.socket));
}
- auto stream = std::move(l.socket);
+}
+
+void AsioFrontend::on_accept(Listener& l, tcp::socket stream)
+{
+ boost::system::error_code ec;
stream.set_option(tcp::no_delay(l.use_nodelay), ec);
- l.acceptor.async_accept(l.socket,
- [this, &l] (boost::system::error_code ec) {
- accept(l, ec);
- });
// spawn a coroutine to handle the connection
#ifdef WITH_RADOSGW_BEAST_OPENSSL
@@ -1086,6 +1153,8 @@ void AsioFrontend::stop()
// close all listeners
for (auto& listener : listeners) {
listener.acceptor.close(ec);
+ // signal cancellation of accept()
+ listener.signal.emit(boost::asio::cancellation_type::terminal);
}
// close all connections
connections.close(ec);
@@ -1107,6 +1176,8 @@ void AsioFrontend::pause()
boost::system::error_code ec;
for (auto& l : listeners) {
l.acceptor.cancel(ec);
+ // signal cancellation of accept()
+ l.signal.emit(boost::asio::cancellation_type::terminal);
}
// close all connections so outstanding requests fail quickly
@@ -1129,10 +1200,12 @@ void AsioFrontend::unpause()
// start accepting connections again
for (auto& l : listeners) {
- l.acceptor.async_accept(l.socket,
- [this, &l] (boost::system::error_code ec) {
- accept(l, ec);
- });
+ boost::asio::spawn(context,
+ [this, &l] (boost::asio::yield_context yield) mutable {
+ accept(l, yield);
+ }, bind_cancellation_slot(l.signal.slot(),
+ bind_executor(context, boost::asio::detached)));
+
}
ldout(ctx(), 4) << "frontend unpaused" << dendl;
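
The accept loop above now runs as a stackful coroutine: a resource-limit error (EMFILE, ENFILE, ENOBUFS, ENOMEM) triggers an exponential backoff before retrying, and a successful accept resets the delay. A minimal stand-alone sketch of that doubling-with-cap policy, with the timer and yield_context plumbing omitted and the class name chosen only for illustration:

#include <algorithm>
#include <chrono>

// Illustrative sketch of the doubling-with-cap policy used by RGWAsioBackoff;
// the waitable timer and coroutine yield are deliberately left out.
class BackoffPolicy {
  std::chrono::milliseconds cur{1};
  static constexpr std::chrono::milliseconds max_wait{5000};
public:
  // delay to sleep before the next retry: 2ms, 4ms, ... capped at 5s
  std::chrono::milliseconds next() {
    cur = std::min(cur * 2, max_wait);
    return cur;
  }
  // call after a successful accept() to start over at the minimum delay
  void reset() { cur = std::chrono::milliseconds(1); }
};
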
diff --git a/src/rgw/rgw_kafka.cc b/src/rgw/rgw_kafka.cc
index d3586e4900b..c0ec3dc2c55 100644
--- a/src/rgw/rgw_kafka.cc
+++ b/src/rgw/rgw_kafka.cc
@@ -19,11 +19,6 @@
#define dout_subsys ceph_subsys_rgw_notification
-// comparison operator between topic pointer and name
-bool operator==(const rd_kafka_topic_t* rkt, const std::string& name) {
- return name == std::string_view(rd_kafka_topic_name(rkt));
-}
-
// this is the inverse of rd_kafka_errno2err
// see: https://github.com/confluentinc/librdkafka/blob/master/src/rdkafka.c
inline int rd_kafka_err2errno(rd_kafka_resp_err_t err) {
@@ -141,9 +136,19 @@ struct reply_callback_with_tag_t {
typedef std::vector<reply_callback_with_tag_t> CallbackList;
+
+
struct connection_t {
rd_kafka_t* producer = nullptr;
- std::vector<rd_kafka_topic_t*> topics;
+
+ struct rd_kafka_topic_deleter {
+ void operator()(rd_kafka_topic_t* topic) {
+ rd_kafka_topic_destroy(topic);
+ }
+ };
+ using topic_ptr = std::unique_ptr<rd_kafka_topic_t, rd_kafka_topic_deleter>;
+ std::map<std::string, topic_ptr> topics;
+
uint64_t delivery_tag = 1;
int status = 0;
CephContext* const cct;
@@ -168,7 +173,6 @@ struct connection_t {
// wait for 500ms to try and handle pending callbacks
rd_kafka_flush(producer, 500);
// destroy all topics
- std::for_each(topics.begin(), topics.end(), [](auto topic) {rd_kafka_topic_destroy(topic);});
topics.clear();
// destroy producer
rd_kafka_destroy(producer);
@@ -431,10 +435,9 @@ private:
}
// create a new topic unless it was already created
- auto topic_it = std::find(conn->topics.begin(), conn->topics.end(), message->topic);
- rd_kafka_topic_t* topic = nullptr;
+ auto topic_it = conn->topics.find(message->topic);
if (topic_it == conn->topics.end()) {
- topic = rd_kafka_topic_new(conn->producer, message->topic.c_str(), nullptr);
+ connection_t::topic_ptr topic(rd_kafka_topic_new(conn->producer, message->topic.c_str(), nullptr));
if (!topic) {
const auto err = rd_kafka_last_error();
ldout(conn->cct, 1) << "Kafka publish: failed to create topic: " << message->topic << " error: "
@@ -444,17 +447,15 @@ private:
}
return;
}
- // TODO use the topics list as an LRU cache
- conn->topics.push_back(topic);
+ topic_it = conn->topics.emplace(message->topic, std::move(topic)).first;
ldout(conn->cct, 20) << "Kafka publish: successfully created topic: " << message->topic << dendl;
} else {
- topic = *topic_it;
ldout(conn->cct, 20) << "Kafka publish: reused existing topic: " << message->topic << dendl;
}
const auto tag = (message->cb == nullptr ? nullptr : new uint64_t(conn->delivery_tag++));
const auto rc = rd_kafka_produce(
- topic,
+ topic_it->second.get(),
// TODO: non builtin partitioning
RD_KAFKA_PARTITION_UA,
// make a copy of the payload
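
The connection now caches one rd_kafka_topic_t per topic name in a std::map, with a unique_ptr deleter calling rd_kafka_topic_destroy(), so clearing the map releases every handle exactly once. A self-contained sketch of the same ownership pattern against a stand-in C-style handle (not the librdkafka API; all names below are illustrative):

#include <map>
#include <memory>
#include <string>

// Stand-ins for a C library handle and its create/destroy pair.
struct handle_t { std::string name; };
handle_t* handle_create(const char* name) { return new handle_t{name}; }
void handle_destroy(handle_t* h) { delete h; }

struct handle_deleter {
  void operator()(handle_t* h) const { handle_destroy(h); }
};
using handle_ptr = std::unique_ptr<handle_t, handle_deleter>;

// Look up a cached handle by name, creating and caching it on first use.
// Destroying the map (or calling clear()) runs the deleter on every entry.
handle_t* get_or_create(std::map<std::string, handle_ptr>& cache,
                        const std::string& name) {
  auto it = cache.find(name);
  if (it == cache.end()) {
    handle_ptr h(handle_create(name.c_str()));
    if (!h) {
      return nullptr;
    }
    it = cache.emplace(name, std::move(h)).first;
  }
  return it->second.get();
}
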
diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc
index fb3002b0a38..0c80ad1b765 100644
--- a/src/rgw/rgw_lc.cc
+++ b/src/rgw/rgw_lc.cc
@@ -258,13 +258,13 @@ void RGWLC::finalize()
delete[] obj_names;
}
-static inline std::ostream& operator<<(std::ostream &os, rgw::sal::Lifecycle::LCEntry& ent) {
+static inline std::ostream& operator<<(std::ostream &os, rgw::sal::LCEntry& ent) {
os << "<ent: bucket=";
- os << ent.get_bucket();
+ os << ent.bucket;
os << "; start_time=";
- os << rgw_to_asctime(utime_t(time_t(ent.get_start_time()), 0));
+ os << rgw_to_asctime(utime_t(ent.start_time, 0));
os << "; status=";
- os << LC_STATUS[ent.get_status()];
+ os << LC_STATUS[ent.status];
os << ">";
return os;
}
@@ -1834,7 +1834,7 @@ public:
};
int RGWLC::bucket_lc_post(int index, int max_lock_sec,
- rgw::sal::Lifecycle::LCEntry& entry, int& result,
+ rgw::sal::LCEntry& entry, int& result,
LCWorker* worker)
{
utime_t lock_duration(cct->_conf->rgw_lc_lock_max_time, 0);
@@ -1872,9 +1872,9 @@ int RGWLC::bucket_lc_post(int index, int max_lock_sec,
}
goto clean;
} else if (result < 0) {
- entry.set_status(lc_failed);
+ entry.status = lc_failed;
} else {
- entry.set_status(lc_complete);
+ entry.status = lc_complete;
}
ret = sal_lc->set_entry(this, null_yield, obj_names[index], entry);
@@ -1891,12 +1891,12 @@ clean:
} /* RGWLC::bucket_lc_post */
int RGWLC::list_lc_progress(string& marker, uint32_t max_entries,
- vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
+ vector<rgw::sal::LCEntry>& progress_map,
int& index)
{
progress_map.clear();
for(; index < max_objs; index++, marker="") {
- vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> entries;
+ vector<rgw::sal::LCEntry> entries;
int ret = sal_lc->list_entries(this, null_yield, obj_names[index],
marker, max_entries, entries);
if (ret < 0) {
@@ -1914,7 +1914,7 @@ int RGWLC::list_lc_progress(string& marker, uint32_t max_entries,
/* update index, marker tuple */
if (progress_map.size() > 0)
- marker = progress_map.back()->get_bucket();
+ marker = progress_map.back().bucket;
if (progress_map.size() >= max_entries)
break;
@@ -2034,7 +2034,6 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker,
std::unique_ptr<rgw::sal::LCSerializer> serializer =
sal_lc->get_serializer(lc_index_lock_name, obj_names[index],
worker->thr_name());
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry;
if (max_lock_secs <= 0) {
return -EAGAIN;
}
@@ -2053,11 +2052,12 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker,
std::unique_lock<rgw::sal::LCSerializer> lock(
*(serializer.get()), std::adopt_lock);
+ rgw::sal::LCEntry entry;
ret = sal_lc->get_entry(this, null_yield, obj_names[index],
- bucket_entry_marker, &entry);
+ bucket_entry_marker, entry);
if (ret >= 0) {
- if (entry->get_status() == lc_processing) {
- if (expired_session(entry->get_start_time())) {
+ if (entry.status == lc_processing) {
+ if (expired_session(entry.start_time)) {
ldpp_dout(this, 5) << "RGWLC::process_bucket(): STALE lc session found for: " << entry
<< " index: " << index << " worker ix: " << worker->ix
<< " (clearing)"
@@ -2074,7 +2074,7 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker,
}
/* do nothing if no bucket */
- if ((! entry) || entry->get_bucket().empty()) {
+ if ((ret < 0) || entry.bucket.empty()) {
return ret;
}
@@ -2082,11 +2082,11 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker,
<< " index: " << index << " worker ix: " << worker->ix
<< dendl;
- entry->set_status(lc_processing);
- ret = sal_lc->set_entry(this, null_yield, obj_names[index], *entry);
+ entry.status = lc_processing;
+ ret = sal_lc->set_entry(this, null_yield, obj_names[index], entry);
if (ret < 0) {
ldpp_dout(this, 0) << "RGWLC::process_bucket() failed to set obj entry "
- << obj_names[index] << entry->get_bucket() << entry->get_status()
+ << obj_names[index] << entry.bucket << entry.status
<< dendl;
return ret;
}
@@ -2096,10 +2096,10 @@ int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker,
<< dendl;
lock.unlock();
- ret = bucket_lc_process(entry->get_bucket(), worker, thread_stop_at(), once);
+ ret = bucket_lc_process(entry.bucket, worker, thread_stop_at(), once);
ldpp_dout(this, 5) << "RGWLC::process_bucket(): END entry 2: " << entry
<< " index: " << index << " worker ix: " << worker->ix << " ret: " << ret << dendl;
- bucket_lc_post(index, max_lock_secs, *entry, ret, worker);
+ bucket_lc_post(index, max_lock_secs, entry, ret, worker);
return ret;
} /* RGWLC::process_bucket */
@@ -2144,15 +2144,15 @@ static inline bool already_run_today(CephContext* cct, time_t start_date)
} /* already_run_today */
inline int RGWLC::advance_head(const std::string& lc_shard,
- rgw::sal::Lifecycle::LCHead& head,
- rgw::sal::Lifecycle::LCEntry& entry,
+ rgw::sal::LCHead& head,
+ const rgw::sal::LCEntry& entry,
time_t start_date)
{
int ret{0};
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry> next_entry;
+ rgw::sal::LCEntry next_entry;
ret = sal_lc->get_next_entry(this, null_yield, lc_shard,
- entry.get_bucket(), &next_entry);
+ entry.bucket, next_entry);
if (ret < 0) {
ldpp_dout(this, 0) << "RGWLC::process() failed to get obj entry "
<< lc_shard << dendl;
@@ -2160,8 +2160,8 @@ inline int RGWLC::advance_head(const std::string& lc_shard,
}
/* save the next position */
- head.set_marker(next_entry->get_bucket());
- head.set_start_date(start_date);
+ head.marker = next_entry.bucket;
+ head.start_date = start_date;
ret = sal_lc->put_head(this, null_yield, lc_shard, head);
if (ret < 0) {
@@ -2175,17 +2175,17 @@ exit:
} /* advance head */
inline int RGWLC::check_if_shard_done(const std::string& lc_shard,
- rgw::sal::Lifecycle::LCHead& head, int worker_ix)
+ rgw::sal::LCHead& head, int worker_ix)
{
int ret{0};
- if (head.get_marker().empty()) {
+ if (head.marker.empty()) {
/* done with this shard */
ldpp_dout(this, 5) <<
"RGWLC::process() next_entry not found. cycle finished lc_shard="
<< lc_shard << " worker=" << worker_ix
<< dendl;
- head.set_shard_rollover_date(ceph_clock_now());
+ head.shard_rollover_date = ceph_clock_now();
ret = sal_lc->put_head(this, null_yield, lc_shard, head);
if (ret < 0) {
ldpp_dout(this, 0) << "RGWLC::process() failed to put head "
@@ -2198,8 +2198,8 @@ inline int RGWLC::check_if_shard_done(const std::string& lc_shard,
}
inline int RGWLC::update_head(const std::string& lc_shard,
- rgw::sal::Lifecycle::LCHead& head,
- rgw::sal::Lifecycle::LCEntry& entry,
+ rgw::sal::LCHead& head,
+ rgw::sal::LCEntry& entry,
time_t start_date, int worker_ix)
{
int ret{0};
@@ -2229,8 +2229,8 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
int ret{0};
const auto& lc_shard = obj_names[index];
- std::unique_ptr<rgw::sal::Lifecycle::LCHead> head;
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry; //string = bucket_name:bucket_id, start_time, int = LC_BUCKET_STATUS
+ rgw::sal::LCHead head;
+ rgw::sal::LCEntry entry; //string = bucket_name:bucket_id, start_time, int = LC_BUCKET_STATUS
ldpp_dout(this, 5) << "RGWLC::process(): ENTER: "
<< "index: " << index << " worker ix: " << worker->ix
@@ -2264,7 +2264,7 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
utime_t now = ceph_clock_now();
/* preamble: find an inital bucket/marker */
- ret = sal_lc->get_head(this, null_yield, lc_shard, &head);
+ ret = sal_lc->get_head(this, null_yield, lc_shard, head);
if (ret < 0) {
ldpp_dout(this, 0) << "RGWLC::process() failed to get obj head "
<< lc_shard << ", ret=" << ret << dendl;
@@ -2273,18 +2273,18 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
/* if there is nothing at head, try to reinitialize head.marker with the
* first entry in the queue */
- if (head->get_marker().empty() &&
- allow_shard_rollover(cct, now, head->get_shard_rollover_date()) /* prevent multiple passes by diff.
+ if (head.marker.empty() &&
+ allow_shard_rollover(cct, now, head.shard_rollover_date) /* prevent multiple passes by diff.
* rgws,in same cycle */) {
ldpp_dout(this, 5) << "RGWLC::process() process shard rollover lc_shard=" << lc_shard
- << " head.marker=" << head->get_marker()
- << " head.shard_rollover_date=" << head->get_shard_rollover_date()
+ << " head.marker=" << head.marker
+ << " head.shard_rollover_date=" << head.shard_rollover_date
<< dendl;
- vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> entries;
+ vector<rgw::sal::LCEntry> entries;
int ret = sal_lc->list_entries(this, null_yield, lc_shard,
- head->get_marker(), 1, entries);
+ head.marker, 1, entries);
if (ret < 0) {
ldpp_dout(this, 0) << "RGWLC::process() sal_lc->list_entries(lc_shard, head.marker, 1, "
<< "entries) returned error ret==" << ret << dendl;
@@ -2292,27 +2292,27 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
}
if (entries.size() > 0) {
entry = std::move(entries.front());
- head->set_marker(entry->get_bucket());
- head->set_start_date(now);
- head->set_shard_rollover_date(0);
+ head.marker = entry.bucket;
+ head.start_date= now;
+ head.shard_rollover_date = 0;
}
} else {
ldpp_dout(this, 0) << "RGWLC::process() head.marker !empty() at START for shard=="
<< lc_shard << " head last stored at "
- << rgw_to_asctime(utime_t(time_t(head->get_start_date()), 0))
+ << rgw_to_asctime(utime_t(head.start_date, 0))
<< dendl;
/* fetches the entry pointed to by head.bucket */
ret = sal_lc->get_entry(this, null_yield, lc_shard,
- head->get_marker(), &entry);
+ head.marker, entry);
if (ret == -ENOENT) {
/* skip to next entry */
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry> tmp_entry = sal_lc->get_entry();
- tmp_entry->set_bucket(head->get_marker());
+ rgw::sal::LCEntry tmp_entry;
+ tmp_entry.bucket = head.marker;
- if (update_head(lc_shard, *head.get(), *tmp_entry.get(), now, worker->ix) != 0) {
- goto exit;
- }
+ if (update_head(lc_shard, head, tmp_entry, now, worker->ix) != 0) {
+ goto exit;
+ }
continue;
}
if (ret < 0) {
@@ -2322,9 +2322,9 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
}
}
- if (entry && !entry->get_bucket().empty()) {
- if (entry->get_status() == lc_processing) {
- if (expired_session(entry->get_start_time())) {
+ if (!entry.bucket.empty()) {
+ if (entry.status == lc_processing) {
+ if (expired_session(entry.start_time)) {
ldpp_dout(this, 5)
<< "RGWLC::process(): STALE lc session found for: " << entry
<< " index: " << index << " worker ix: " << worker->ix
@@ -2334,19 +2334,19 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
<< "RGWLC::process(): ACTIVE entry: " << entry
<< " index: " << index << " worker ix: " << worker->ix << dendl;
/* skip to next entry */
- if (update_head(lc_shard, *head.get(), *entry.get(), now, worker->ix) != 0) {
+ if (update_head(lc_shard, head, entry, now, worker->ix) != 0) {
goto exit;
}
continue;
}
} else {
- if ((entry->get_status() == lc_complete) &&
- already_run_today(cct, entry->get_start_time())) {
+ if ((entry.status == lc_complete) &&
+ already_run_today(cct, entry.start_time)) {
ldpp_dout(this, 5) << "RGWLC::process() worker ix: " << worker->ix
- << " SKIP processing for already-processed bucket " << entry->get_bucket()
+ << " SKIP processing for already-processed bucket " << entry.bucket
<< dendl;
/* skip to next entry */
- if (update_head(lc_shard, *head.get(), *entry.get(), now, worker->ix) != 0) {
+ if (update_head(lc_shard, head, entry, now, worker->ix) != 0) {
goto exit;
}
continue;
@@ -2368,18 +2368,18 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
<< " index: " << index << " worker ix: " << worker->ix
<< dendl;
- entry->set_status(lc_processing);
- entry->set_start_time(now);
+ entry.status = lc_processing;
+ entry.start_time = now;
- ret = sal_lc->set_entry(this, null_yield, lc_shard, *entry);
+ ret = sal_lc->set_entry(this, null_yield, lc_shard, entry);
if (ret < 0) {
ldpp_dout(this, 0) << "RGWLC::process() failed to set obj entry "
- << lc_shard << entry->get_bucket() << entry->get_status() << dendl;
+ << lc_shard << entry.bucket << entry.status << dendl;
goto exit;
}
/* advance head for next waiter, then process */
- if (advance_head(lc_shard, *head.get(), *entry.get(), now) < 0) {
+ if (advance_head(lc_shard, head, entry, now) < 0) {
goto exit;
}
@@ -2390,7 +2390,7 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
/* drop lock so other instances can make progress while this
* bucket is being processed */
lock->unlock();
- ret = bucket_lc_process(entry->get_bucket(), worker, thread_stop_at(), once);
+ ret = bucket_lc_process(entry.bucket, worker, thread_stop_at(), once);
ldpp_dout(this, 5) << "RGWLC::process(): END entry 2: " << entry
<< " index: " << index << " worker ix: " << worker->ix << " ret: " << ret << dendl;
@@ -2407,7 +2407,7 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
/* XXXX are we SURE the only way result could == ENOENT is when
* there is no such bucket? It is currently the value returned
* from bucket_lc_process(...) */
- ret = sal_lc->rm_entry(this, null_yield, lc_shard, *entry);
+ ret = sal_lc->rm_entry(this, null_yield, lc_shard, entry);
if (ret < 0) {
ldpp_dout(this, 0) << "RGWLC::process() failed to remove entry "
<< lc_shard << " (nonfatal)"
@@ -2416,11 +2416,11 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
}
} else {
if (ret < 0) {
- entry->set_status(lc_failed);
+ entry.status = lc_failed;
} else {
- entry->set_status(lc_complete);
+ entry.status = lc_complete;
}
- ret = sal_lc->set_entry(this, null_yield, lc_shard, *entry);
+ ret = sal_lc->set_entry(this, null_yield, lc_shard, entry);
if (ret < 0) {
ldpp_dout(this, 0) << "RGWLC::process() failed to set entry on lc_shard="
<< lc_shard << " entry=" << entry
@@ -2430,7 +2430,7 @@ int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
}
}
- if (check_if_shard_done(lc_shard, *head.get(), worker->ix) != 0 ) {
+ if (check_if_shard_done(lc_shard, head, worker->ix) != 0 ) {
goto exit;
}
} while(1 && !once && !going_down());
@@ -2569,9 +2569,9 @@ static int guard_lc_modify(const DoutPrefixProvider *dpp,
get_lc_oid(cct, bucket_lc_key, &oid);
/* XXX it makes sense to take shard_id for a bucket_id? */
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry = sal_lc->get_entry();
- entry->set_bucket(bucket_lc_key);
- entry->set_status(lc_uninitial);
+ rgw::sal::LCEntry entry;
+ entry.bucket = bucket_lc_key;
+ entry.status = lc_uninitial;
int max_lock_secs = cct->_conf->rgw_lc_lock_max_time;
std::unique_ptr<rgw::sal::LCSerializer> lock =
@@ -2598,7 +2598,7 @@ static int guard_lc_modify(const DoutPrefixProvider *dpp,
<< oid << ", ret=" << ret << dendl;
break;
}
- ret = f(sal_lc, oid, *entry.get());
+ ret = f(sal_lc, oid, entry);
if (ret < 0) {
ldpp_dout(dpp, 0) << "RGWLC::RGWPutLC() failed to set entry on "
<< oid << ", ret=" << ret << dendl;
@@ -2634,7 +2634,7 @@ int RGWLC::set_bucket_config(const DoutPrefixProvider* dpp, optional_yield y,
ret = guard_lc_modify(dpp, driver, sal_lc.get(), b, cookie,
[&](rgw::sal::Lifecycle* sal_lc, const string& oid,
- rgw::sal::Lifecycle::LCEntry& entry) {
+ rgw::sal::LCEntry& entry) {
return sal_lc->set_entry(dpp, y, oid, entry);
});
@@ -2663,7 +2663,7 @@ int RGWLC::remove_bucket_config(const DoutPrefixProvider* dpp, optional_yield y,
ret = guard_lc_modify(dpp, driver, sal_lc.get(), b, cookie,
[&](rgw::sal::Lifecycle* sal_lc, const string& oid,
- rgw::sal::Lifecycle::LCEntry& entry) {
+ rgw::sal::LCEntry& entry) {
return sal_lc->rm_entry(dpp, y, oid, entry);
});
@@ -2692,13 +2692,13 @@ int fix_lc_shard_entry(const DoutPrefixProvider *dpp,
std::string lc_oid;
get_lc_oid(driver->ctx(), bucket_lc_key, &lc_oid);
- std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry;
+ rgw::sal::LCEntry entry;
// There are multiple cases we need to encounter here
// 1. entry exists and is already set to marker, happens in plain buckets & newly resharded buckets
// 2. entry doesn't exist, which usually happens when reshard has happened prior to update and next LC process has already dropped the update
// 3. entry exists matching the current bucket id which was after a reshard (needs to be updated to the marker)
// We are not dropping the old marker here as that would be caught by the next LC process update
- int ret = sal_lc->get_entry(dpp, null_yield, lc_oid, bucket_lc_key, &entry);
+ int ret = sal_lc->get_entry(dpp, null_yield, lc_oid, bucket_lc_key, entry);
if (ret == 0) {
ldpp_dout(dpp, 5) << "Entry already exists, nothing to do" << dendl;
return ret; // entry is already existing correctly set to marker
@@ -2716,7 +2716,7 @@ int fix_lc_shard_entry(const DoutPrefixProvider *dpp,
driver, sal_lc, bucket->get_key(), cookie,
[dpp, &lc_oid](rgw::sal::Lifecycle* slc,
const string& oid,
- rgw::sal::Lifecycle::LCEntry& entry) {
+ rgw::sal::LCEntry& entry) {
return slc->set_entry(dpp, null_yield, lc_oid, entry);
});
diff --git a/src/rgw/rgw_lc.h b/src/rgw/rgw_lc.h
index d53e14cee65..b4c6ad4a86b 100644
--- a/src/rgw/rgw_lc.h
+++ b/src/rgw/rgw_lc.h
@@ -628,15 +628,15 @@ public:
const std::unique_ptr<rgw::sal::Bucket>& optional_bucket,
bool once);
int advance_head(const std::string& lc_shard,
- rgw::sal::Lifecycle::LCHead& head,
- rgw::sal::Lifecycle::LCEntry& entry,
+ rgw::sal::LCHead& head,
+ const rgw::sal::LCEntry& entry,
time_t start_date);
int check_if_shard_done(const std::string& lc_shard,
- rgw::sal::Lifecycle::LCHead& head,
+ rgw::sal::LCHead& head,
int worker_ix);
int update_head(const std::string& lc_shard,
- rgw::sal::Lifecycle::LCHead& head,
- rgw::sal::Lifecycle::LCEntry& entry,
+ rgw::sal::LCHead& head,
+ rgw::sal::LCEntry& entry,
time_t start_date, int worker_ix);
int process(int index, int max_lock_secs, LCWorker* worker, bool once);
int process_bucket(int index, int max_lock_secs, LCWorker* worker,
@@ -644,12 +644,12 @@ public:
bool expired_session(time_t started);
time_t thread_stop_at();
int list_lc_progress(std::string& marker, uint32_t max_entries,
- std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>&,
+ std::vector<rgw::sal::LCEntry>&,
int& index);
int bucket_lc_process(std::string& shard_id, LCWorker* worker, time_t stop_at,
bool once);
int bucket_lc_post(int index, int max_lock_sec,
- rgw::sal::Lifecycle::LCEntry& entry, int& result, LCWorker* worker);
+ rgw::sal::LCEntry& entry, int& result, LCWorker* worker);
bool going_down();
void start_processor();
void stop_processor();
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 870dd1cee90..b54805bdc7d 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -6703,15 +6703,61 @@ void RGWCompleteMultipart::execute(optional_yield y)
return;
}
+ RGWObjVersionTracker& objv_tracker = meta_obj->get_version_tracker();
+
+ using prefix_map_t = rgw::sal::MultipartUpload::prefix_map_t;
+ prefix_map_t processed_prefixes;
+
op_ret =
upload->complete(this, y, s->cct, parts->parts, remove_objs, accounted_size,
- compressed, cs_info, ofs, s->req_id, s->owner, olh_epoch,
- s->object.get());
+ compressed, cs_info, ofs, s->req_id, s->owner, olh_epoch,
+ s->object.get(), processed_prefixes);
if (op_ret < 0) {
ldpp_dout(this, 0) << "ERROR: upload complete failed ret=" << op_ret << dendl;
return;
}
+ remove_objs.clear();
+
+ // use cls_version_check() when deleting the meta object to detect part uploads that raced
+ // with upload->complete(). any parts that finish after that won't be part of the final
+ // upload, so they need to be gc'd and removed from the bucket index before retrying
+ // deletion of the multipart meta object
+ static constexpr auto MAX_DELETE_RETRIES = 15u;
+ for (auto i = 0u; i < MAX_DELETE_RETRIES; i++) {
+ // remove the upload meta object; the meta object is not versioned
+ // when the bucket is, as that would add an unneeded delete marker
+ int ret = meta_obj->delete_object(this, y, rgw::sal::FLAG_PREVENT_VERSIONING, &remove_objs, &objv_tracker);
+ if (ret != -ECANCELED || i == MAX_DELETE_RETRIES - 1) {
+ if (ret >= 0) {
+ /* serializer's exclusive lock is released */
+ serializer->clear_locked();
+ } else {
+ ldpp_dout(this, 1) << "ERROR: failed to remove object " << meta_obj << ", ret: " << ret << dendl;
+ }
+ break;
+ }
+
+ ldpp_dout(this, 20) << "deleting meta_obj is cancelled due to mismatch cls_version: " << objv_tracker << dendl;
+ objv_tracker.clear();
+
+ ret = meta_obj->get_obj_attrs(s->yield, this);
+ if (ret < 0) {
+ ldpp_dout(this, 1) << "ERROR: failed to get obj attrs, obj=" << meta_obj
+ << " ret=" << ret << dendl;
+
+ if (ret != -ENOENT) {
+ ldpp_dout(this, 0) << "ERROR: failed to remove object " << meta_obj << dendl;
+ }
+ break;
+ }
+
+ ret = upload->cleanup_orphaned_parts(this, s->cct, y, meta_obj->get_obj(), remove_objs, processed_prefixes);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to clenup orphaned parts. ret=" << ret << dendl;
+ }
+ }
+
const ceph::real_time upload_time = upload->get_mtime();
etag = s->object->get_attrs()[RGW_ATTR_ETAG].to_str();
@@ -6721,17 +6767,6 @@ void RGWCompleteMultipart::execute(optional_yield y)
ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
// too late to rollback operation, hence op_ret is not set here
}
-
- // remove the upload meta object ; the meta object is not versioned
- // when the bucket is, as that would add an unneeded delete marker
- ret = meta_obj->delete_object(this, y, rgw::sal::FLAG_PREVENT_VERSIONING);
- if (ret >= 0) {
- /* serializer's exclusive lock is released */
- serializer->clear_locked();
- } else {
- ldpp_dout(this, 4) << "WARNING: failed to remove object " << meta_obj << ", ret: " << ret << dendl;
- }
-
} // RGWCompleteMultipart::execute
bool RGWCompleteMultipart::check_previously_completed(const RGWMultiCompleteUpload* parts)
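
RGWCompleteMultipart::execute() now deletes the multipart meta object under the version tracker and, on -ECANCELED (a cls_version mismatch caused by a racing part upload), cleans up the orphaned parts and retries, up to 15 times. A minimal sketch of that retry shape, assuming hypothetical callables in place of delete_object() and cleanup_orphaned_parts():

#include <cerrno>

// Minimal sketch of the bounded retry loop above. do_delete() is expected to
// return -ECANCELED on a version mismatch; reconcile() stands in for cleaning
// up parts that raced with completion. Both names are illustrative.
template <typename Delete, typename Reconcile>
int delete_with_retries(Delete do_delete, Reconcile reconcile,
                        unsigned max_retries = 15) {
  int ret = 0;
  for (unsigned i = 0; i < max_retries; ++i) {
    ret = do_delete();
    if (ret != -ECANCELED) {
      break;               // success, or an error that retrying will not fix
    }
    if (int r = reconcile(); r < 0) {
      return r;            // stop if reconciliation itself fails
    }
  }
  return ret;
}
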
diff --git a/src/rgw/rgw_process.cc b/src/rgw/rgw_process.cc
index 14d6d26f7d2..8be7be79069 100644
--- a/src/rgw/rgw_process.cc
+++ b/src/rgw/rgw_process.cc
@@ -460,20 +460,24 @@ done:
} else {
ldpp_dout(s, 2) << "http status=" << s->err.http_ret << dendl;
}
- if (handler)
- handler->put_op(op);
- rest->put_handler(handler);
const auto lat = s->time_elapsed();
if (latency) {
*latency = lat;
}
dout(1) << "====== req done req=" << hex << req << dec
- << " op status=" << op_ret
- << " http_status=" << s->err.http_ret
- << " latency=" << lat
- << " ======"
- << dendl;
+ << " op=" << (op ? op->name() : "unknown")
+ << " bucket=" << s->bucket_name
+ << " status=" << op_ret
+ << " http_status=" << s->err.http_ret
+ << " latency=" << lat
+ << " request_id=" << s->trans_id
+ << " ======"
+ << dendl;
+
+ if (handler)
+ handler->put_op(op);
+ rest->put_handler(handler);
return (ret < 0 ? ret : s->err.ret);
} /* process_request */
diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h
index 8ddfb7f7c7d..d6129165689 100644
--- a/src/rgw/rgw_sal.h
+++ b/src/rgw/rgw_sal.h
@@ -1096,6 +1096,7 @@ class Object {
rgw_zone_set* zones_trace{nullptr};
bool abortmp{false};
uint64_t parts_accounted_size{0};
+ RGWObjVersionTracker* objv_tracker = nullptr;
} params;
struct Result {
@@ -1115,7 +1116,9 @@ class Object {
/** Shortcut synchronous delete call for common deletes */
virtual int delete_object(const DoutPrefixProvider* dpp,
optional_yield y,
- uint32_t flags) = 0;
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv) = 0;
/** Copy an this object to another object. */
virtual int copy_object(const ACLOwner& owner, const rgw_user& remote_user,
req_info* info, const rgw_zone_id& source_zone,
@@ -1278,6 +1281,9 @@ class Object {
virtual int get_torrent_info(const DoutPrefixProvider* dpp,
optional_yield y, bufferlist& bl) = 0;
+ /** Get the version tracker for this object */
+ virtual RGWObjVersionTracker& get_version_tracker() = 0;
+
/** Get the OMAP values matching the given set of keys */
virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
const std::set<std::string>& keys,
@@ -1361,6 +1367,8 @@ public:
*/
class MultipartUpload {
public:
+ using prefix_map_t = boost::container::flat_map<uint32_t, boost::container::flat_set<std::string>>;
+
//object lock
std::optional<RGWObjectRetention> obj_retention = std::nullopt;
std::optional<RGWObjectLegalHold> obj_legal_hold = std::nullopt;
@@ -1407,7 +1415,14 @@ public:
RGWCompressionInfo& cs_info, off_t& ofs,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj) = 0;
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes) = 0;
+ /** Clean up orphaned parts caused by a race condition involving part upload retries */
+ virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes) = 0;
/** Get placement and/or attribute info for this upload */
virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) = 0;
@@ -1491,6 +1506,21 @@ public:
virtual ~LCSerializer() = default;
};
+/** Head of a lifecycle run. Used for tracking parallel lifecycle runs. */
+struct LCHead {
+ time_t start_date = 0;
+ std::string marker;
+ time_t shard_rollover_date = 0;
+};
+
+/** Single entry in a lifecycle run. Multiple entries can exist processing different
+ * buckets. */
+struct LCEntry {
+ std::string bucket;
+ uint64_t start_time = 0;
+ uint32_t status = 0;
+};
+
/**
* @brief Abstraction for lifecycle processing
*
@@ -1500,84 +1530,34 @@ public:
*/
class Lifecycle {
public:
- /** Head of a lifecycle run. Used for tracking parallel lifecycle runs. */
- struct LCHead {
- LCHead() = default;
- virtual ~LCHead() = default;
-
- virtual time_t& get_start_date() = 0;
- virtual void set_start_date(time_t) = 0;
- virtual std::string& get_marker() = 0;
- virtual void set_marker(const std::string&) = 0;
- virtual time_t& get_shard_rollover_date() = 0;
- virtual void set_shard_rollover_date(time_t) = 0;
- };
-
- /** Single entry in a lifecycle run. Multiple entries can exist processing different
- * buckets. */
- struct LCEntry {
- LCEntry() = default;
- virtual ~LCEntry() = default;
-
- virtual std::string& get_bucket() = 0;
- virtual void set_bucket(const std::string&) = 0;
- virtual std::string& get_oid() = 0;
- virtual void set_oid(const std::string&) = 0;
- virtual uint64_t get_start_time() = 0;
- virtual void set_start_time(uint64_t) = 0;
- virtual uint32_t get_status() = 0;
- virtual void set_status(uint32_t) = 0;
-
- /** Print the entry to @a out */
- virtual void print(std::ostream& out) const = 0;
-
- friend inline std::ostream& operator<<(std::ostream& out, const LCEntry& e) {
- e.print(out);
- return out;
- }
- friend inline std::ostream& operator<<(std::ostream& out, const LCEntry* e) {
- if (!e)
- out << "<NULL>";
- else
- e->print(out);
- return out;
- }
- friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr<LCEntry>& p) {
- out << p.get();
- return out;
- }
- };
-
Lifecycle() = default;
virtual ~Lifecycle() = default;
- /** Get an empty entry */
- virtual std::unique_ptr<LCEntry> get_entry() = 0;
/** Get an entry matching the given marker */
virtual int get_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry) = 0;
+ LCEntry& entry) = 0;
/** Get the entry following the given marker */
virtual int get_next_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry) = 0;
+ LCEntry& entry) = 0;
/** Store a modified entry in then backing store */
virtual int set_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry) = 0;
+ const std::string& oid, const LCEntry& entry) = 0;
/** List all known entries */
virtual int list_entries(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
uint32_t max_entries,
- std::vector<std::unique_ptr<LCEntry>>& entries) = 0;
+ std::vector<LCEntry>& entries) = 0;
/** Remove an entry from the backing store */
virtual int rm_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry) = 0;
+ const std::string& oid, const LCEntry& entry) = 0;
/** Get a head */
virtual int get_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, std::unique_ptr<LCHead>* head) = 0;
+ const std::string& oid, LCHead& head) = 0;
/** Store a modified head to the backing store */
virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCHead& head) = 0;
+ const std::string& oid, const LCHead& head) = 0;
/** Get a serializer for lifecycle */
virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
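
With LCHead and LCEntry reduced to plain structs passed by reference, lifecycle callers touch fields directly instead of driving virtual getters and setters on heap-allocated entries. A small stand-in sketch of the caller-side effect (types redeclared here only for illustration, not the real rgw::sal headers):

#include <cstdint>
#include <ctime>
#include <string>

// Plain-value stand-ins mirroring the LCHead / LCEntry structs added above.
struct LCHead {
  std::time_t start_date = 0;
  std::string marker;
  std::time_t shard_rollover_date = 0;
};
struct LCEntry {
  std::string bucket;
  std::uint64_t start_time = 0;
  std::uint32_t status = 0;
};

// Direct field access replaces the former set_marker()/set_start_date() calls
// on a std::unique_ptr<LCHead>.
void advance(LCHead& head, const LCEntry& next_entry, std::time_t now) {
  head.marker = next_entry.bucket;
  head.start_date = now;
}
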
diff --git a/src/rgw/rgw_sal_dbstore.cc b/src/rgw/rgw_sal_dbstore.cc
index 51627914db2..5f8bab70817 100644
--- a/src/rgw/rgw_sal_dbstore.cc
+++ b/src/rgw/rgw_sal_dbstore.cc
@@ -715,7 +715,11 @@ namespace rgw::sal {
return ret;
}
- int DBObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, uint32_t flags)
+ int DBObject::delete_object(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv)
{
DB::Object del_target(store->getDB(), bucket->get_info(), get_obj());
DB::Object::Delete del_op(&del_target);
@@ -907,7 +911,8 @@ namespace rgw::sal {
RGWCompressionInfo& cs_info, off_t& ofs,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj)
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes)
{
char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
@@ -1017,6 +1022,15 @@ namespace rgw::sal {
return ret;
}
+ int DBMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes)
+ {
+ return -ENOTSUP;
+ }
+
int DBMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs)
{
if (!rule && !attrs) {
@@ -1840,45 +1854,45 @@ namespace rgw::sal {
int DBLifecycle::get_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry)
+ LCEntry& entry)
{
return store->getDB()->get_entry(oid, marker, entry);
}
int DBLifecycle::get_next_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry)
+ LCEntry& entry)
{
return store->getDB()->get_next_entry(oid, marker, entry);
}
int DBLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry)
+ const std::string& oid, const LCEntry& entry)
{
return store->getDB()->set_entry(oid, entry);
}
int DBLifecycle::list_entries(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- uint32_t max_entries, vector<std::unique_ptr<LCEntry>>& entries)
+ uint32_t max_entries, vector<LCEntry>& entries)
{
return store->getDB()->list_entries(oid, marker, max_entries, entries);
}
int DBLifecycle::rm_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry)
+ const std::string& oid, const LCEntry& entry)
{
return store->getDB()->rm_entry(oid, entry);
}
int DBLifecycle::get_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, std::unique_ptr<LCHead>* head)
+ const std::string& oid, LCHead& head)
{
return store->getDB()->get_head(oid, head);
}
int DBLifecycle::put_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCHead& head)
+ const std::string& oid, const LCHead& head)
{
return store->getDB()->put_head(oid, head);
}
diff --git a/src/rgw/rgw_sal_dbstore.h b/src/rgw/rgw_sal_dbstore.h
index f163b874eb0..107ba735a63 100644
--- a/src/rgw/rgw_sal_dbstore.h
+++ b/src/rgw/rgw_sal_dbstore.h
@@ -38,31 +38,30 @@ public:
}
};
-class DBLifecycle : public StoreLifecycle {
+class DBLifecycle : public Lifecycle {
DBStore* store;
public:
DBLifecycle(DBStore* _st) : store(_st) {}
- using StoreLifecycle::get_entry;
virtual int get_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry) override;
+ LCEntry& entry) override;
virtual int get_next_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry) override;
+ LCEntry& entry) override;
virtual int set_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry) override;
+ const std::string& oid, const LCEntry& entry) override;
virtual int list_entries(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
uint32_t max_entries,
- std::vector<std::unique_ptr<LCEntry>>& entries) override;
+ std::vector<LCEntry>& entries) override;
virtual int rm_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry) override;
+ const std::string& oid, const LCEntry& entry) override;
virtual int get_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, std::unique_ptr<LCHead>* head) override;
- virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, LCHead& head) override;
+ virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y,
+ const std::string& oid, const LCHead& head) override;
virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
const std::string& oid,
const std::string& cookie) override;
@@ -460,7 +459,13 @@ protected:
RGWCompressionInfo& cs_info, off_t& ofs,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj) override;
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes) override;
+ virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes) override;
virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override;
virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
optional_yield y,
@@ -532,7 +537,9 @@ protected:
virtual int delete_object(const DoutPrefixProvider* dpp,
optional_yield y,
- uint32_t flags) override;
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv) override;
virtual int copy_object(const ACLOwner& owner,
const rgw_user& remote_user,
req_info* info, const rgw_zone_id& source_zone,
diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc
index 5afefa3d258..272862cb7e1 100644
--- a/src/rgw/rgw_sal_filter.cc
+++ b/src/rgw/rgw_sal_filter.cc
@@ -994,9 +994,11 @@ int FilterBucket::abort_multiparts(const DoutPrefixProvider* dpp, CephContext* c
int FilterObject::delete_object(const DoutPrefixProvider* dpp,
optional_yield y,
- uint32_t flags)
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv)
{
- return next->delete_object(dpp, y, flags);
+ return next->delete_object(dpp, y, flags, remove_objs, objv);
}
int FilterObject::copy_object(const ACLOwner& owner,
@@ -1282,11 +1284,21 @@ int FilterMultipartUpload::complete(const DoutPrefixProvider *dpp,
RGWCompressionInfo& cs_info, off_t& ofs,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj)
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes)
{
return next->complete(dpp, y, cct, part_etags, remove_objs, accounted_size,
compressed, cs_info, ofs, tag, owner, olh_epoch,
- nextObject(target_obj));
+ nextObject(target_obj), processed_prefixes);
+}
+
+int FilterMultipartUpload::cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes)
+{
+ return next->cleanup_orphaned_parts(dpp, cct, y, obj, remove_objs, processed_prefixes);
}
int FilterMultipartUpload::get_info(const DoutPrefixProvider *dpp,
@@ -1324,40 +1336,22 @@ int FilterLCSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur,
return next->try_lock(dpp, dur, y);
}
-std::unique_ptr<Lifecycle::LCEntry> FilterLifecycle::get_entry()
-{
- std::unique_ptr<Lifecycle::LCEntry> e = next->get_entry();
- return std::make_unique<FilterLCEntry>(std::move(e));
-}
-
int FilterLifecycle::get_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry)
+ LCEntry& entry)
{
- std::unique_ptr<LCEntry> ne;
- int ret = next->get_entry(dpp, y, oid, marker, &ne);
- if (ret < 0)
- return ret;
-
- *entry = std::make_unique<FilterLCEntry>(std::move(ne));
- return 0;
+ return next->get_entry(dpp, y, oid, marker, entry);
}
int FilterLifecycle::get_next_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry)
+ LCEntry& entry)
{
- std::unique_ptr<LCEntry> ne;
- int ret = next->get_next_entry(dpp, y, oid, marker, &ne);
- if (ret < 0)
- return ret;
-
- *entry = std::make_unique<FilterLCEntry>(std::move(ne));
- return 0;
+ return next->get_next_entry(dpp, y, oid, marker, entry);
}
int FilterLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry)
+ const std::string& oid, const LCEntry& entry)
{
return next->set_entry(dpp, y, oid, entry);
}
@@ -1365,42 +1359,27 @@ int FilterLifecycle::set_entry(const DoutPrefixProvider* dpp, optional_yield y,
int FilterLifecycle::list_entries(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
uint32_t max_entries,
- std::vector<std::unique_ptr<LCEntry>>& entries)
+ std::vector<LCEntry>& entries)
{
- std::vector<std::unique_ptr<LCEntry>> ne;
- int ret = next->list_entries(dpp, y, oid, marker, max_entries, ne);
- if (ret < 0)
- return ret;
-
- for (auto& ent : ne) {
- entries.emplace_back(std::make_unique<FilterLCEntry>(std::move(ent)));
- }
-
- return 0;
+ return next->list_entries(dpp, y, oid, marker, max_entries, entries);
}
int FilterLifecycle::rm_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry)
+ const std::string& oid, const LCEntry& entry)
{
return next->rm_entry(dpp, y, oid, entry);
}
int FilterLifecycle::get_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, std::unique_ptr<LCHead>* head)
+ const std::string& oid, LCHead& head)
{
- std::unique_ptr<LCHead> nh;
- int ret = next->get_head(dpp, y, oid, &nh);
- if (ret < 0)
- return ret;
-
- *head = std::make_unique<FilterLCHead>(std::move(nh));
- return 0;
+ return next->get_head(dpp, y, oid, head);
}
int FilterLifecycle::put_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCHead& head)
+ const std::string& oid, const LCHead& head)
{
- return next->put_head(dpp, y, oid, *(dynamic_cast<FilterLCHead&>(head).next.get()));
+ return next->put_head(dpp, y, oid, head);
}
std::unique_ptr<LCSerializer> FilterLifecycle::get_serializer(
diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h
index fc2e3aeffa3..b12ea53a9bb 100644
--- a/src/rgw/rgw_sal_filter.h
+++ b/src/rgw/rgw_sal_filter.h
@@ -727,7 +727,9 @@ public:
virtual int delete_object(const DoutPrefixProvider* dpp,
optional_yield y,
- uint32_t flags) override;
+ uint32_t flags,
+ std::list<rgw_obj_index_key>* remove_objs,
+ RGWObjVersionTracker* objv) override;
virtual int copy_object(const ACLOwner& owner,
const rgw_user& remote_user,
req_info* info, const rgw_zone_id& source_zone,
@@ -843,6 +845,8 @@ public:
virtual int get_torrent_info(const DoutPrefixProvider* dpp,
optional_yield y, bufferlist& bl) override;
+ virtual RGWObjVersionTracker& get_version_tracker() override { return next->get_version_tracker(); }
+
virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp,
const std::string& oid,
const std::set<std::string>& keys,
@@ -920,7 +924,13 @@ public:
RGWCompressionInfo& cs_info, off_t& ofs,
std::string& tag, ACLOwner& owner,
uint64_t olh_epoch,
- rgw::sal::Object* target_obj) override;
+ rgw::sal::Object* target_obj,
+ prefix_map_t& processed_prefixes) override;
+ virtual int cleanup_orphaned_parts(const DoutPrefixProvider *dpp,
+ CephContext *cct, optional_yield y,
+ const rgw_obj& obj,
+ std::list<rgw_obj_index_key>& remove_objs,
+ prefix_map_t& processed_prefixes) override;
virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y,
rgw_placement_rule** rule,
@@ -969,59 +979,28 @@ protected:
std::unique_ptr<Lifecycle> next;
public:
- struct FilterLCHead : LCHead {
- std::unique_ptr<LCHead> next;
-
- FilterLCHead(std::unique_ptr<LCHead> _next) : next(std::move(_next)) {}
- virtual ~FilterLCHead() = default;
-
- virtual time_t& get_start_date() override { return next->get_start_date(); }
- virtual void set_start_date(time_t t) override { next->set_start_date(t); }
- virtual std::string& get_marker() override { return next->get_marker(); }
- virtual void set_marker(const std::string& m) override { next->set_marker(m); }
- virtual time_t& get_shard_rollover_date() override { return next->get_shard_rollover_date(); }
- virtual void set_shard_rollover_date(time_t t) override { next->set_shard_rollover_date(t); }
- };
-
- struct FilterLCEntry : LCEntry {
- std::unique_ptr<LCEntry> next;
-
- FilterLCEntry(std::unique_ptr<LCEntry> _next) : next(std::move(_next)) {}
- virtual ~FilterLCEntry() = default;
-
- virtual std::string& get_bucket() override { return next->get_bucket(); }
- virtual void set_bucket(const std::string& b) override { next->set_bucket(b); }
- virtual std::string& get_oid() override { return next->get_oid(); }
- virtual void set_oid(const std::string& o) override { next->set_oid(o); }
- virtual uint64_t get_start_time() override { return next->get_start_time(); }
- virtual void set_start_time(uint64_t t) override { next->set_start_time(t); }
- virtual uint32_t get_status() override { return next->get_status(); }
- virtual void set_status(uint32_t s) override { next->set_status(s); }
- virtual void print(std::ostream& out) const override { return next->print(out); }
- };
FilterLifecycle(std::unique_ptr<Lifecycle> _next) : next(std::move(_next)) {}
virtual ~FilterLifecycle() = default;
- virtual std::unique_ptr<LCEntry> get_entry() override;
virtual int get_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry) override;
+ LCEntry& entry) override;
virtual int get_next_entry(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
- std::unique_ptr<LCEntry>* entry) override;
+ LCEntry& entry) override;
virtual int set_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry) override;
+ const std::string& oid, const LCEntry& entry) override;
virtual int list_entries(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, const std::string& marker,
uint32_t max_entries,
- std::vector<std::unique_ptr<LCEntry>>& entries) override;
+ std::vector<LCEntry>& entries) override;
virtual int rm_entry(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, LCEntry& entry) override;
+ const std::string& oid, const LCEntry& entry) override;
virtual int get_head(const DoutPrefixProvider* dpp, optional_yield y,
- const std::string& oid, std::unique_ptr<LCHead>* head) override;
- virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y,
const std::string& oid, LCHead& head) override;
+ virtual int put_head(const DoutPrefixProvider* dpp, optional_yield y,
+ const std::string& oid, const LCHead& head) override;
virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
const std::string& oid,
const std::string& cookie) override;
diff --git a/src/rgw/rgw_sal_store.h b/src/rgw/rgw_sal_store.h
index 6084612d127..d9b2f80e1b6 100644
--- a/src/rgw/rgw_sal_store.h
+++ b/src/rgw/rgw_sal_store.h
@@ -365,6 +365,8 @@ class StoreObject : public Object {
return -ENOENT;
}
+ virtual RGWObjVersionTracker& get_version_tracker() override { return state.objv_tracker; }
+
virtual void print(std::ostream& out) const override {
if (bucket)
out << bucket << ":";
@@ -428,74 +430,6 @@ public:
virtual void print(std::ostream& out) const override { out << oid; }
};
-class StoreLifecycle : public Lifecycle {
-public:
- struct StoreLCHead : LCHead {
- time_t start_date{0};
- time_t shard_rollover_date{0};
- std::string marker;
-
- StoreLCHead() = default;
- StoreLCHead(time_t _start_date, time_t _rollover_date, std::string& _marker) : start_date(_start_date), shard_rollover_date(_rollover_date), marker(_marker) {}
-
- StoreLCHead& operator=(LCHead& _h) {
- start_date = _h.get_start_date();
- shard_rollover_date = _h.get_shard_rollover_date();
- marker = _h.get_marker();
-
- return *this;
- }
-
- virtual time_t& get_start_date() override { return start_date; }
- virtual void set_start_date(time_t _date) override { start_date = _date; }
- virtual std::string& get_marker() override { return marker; }
- virtual void set_marker(const std::string& _marker) override { marker = _marker; }
- virtual time_t& get_shard_rollover_date() override { return shard_rollover_date; }
- virtual void set_shard_rollover_date(time_t _date) override { shard_rollover_date = _date; }
- };
-
- struct StoreLCEntry : LCEntry {
- std::string bucket;
- std::string oid;
- uint64_t start_time{0};
- uint32_t status{0};
-
- StoreLCEntry() = default;
- StoreLCEntry(std::string& _bucket, uint64_t _time, uint32_t _status) : bucket(_bucket), start_time(_time), status(_status) {}
- StoreLCEntry(std::string& _bucket, std::string _oid, uint64_t _time, uint32_t _status) : bucket(_bucket), oid(_oid), start_time(_time), status(_status) {}
- StoreLCEntry(const StoreLCEntry& _e) = default;
-
- StoreLCEntry& operator=(LCEntry& _e) {
- bucket = _e.get_bucket();
- oid = _e.get_oid();
- start_time = _e.get_start_time();
- status = _e.get_status();
-
- return *this;
- }
-
- virtual std::string& get_bucket() override { return bucket; }
- virtual void set_bucket(const std::string& _bucket) override { bucket = _bucket; }
- virtual std::string& get_oid() override { return oid; }
- virtual void set_oid(const std::string& _oid) override { oid = _oid; }
- virtual uint64_t get_start_time() override { return start_time; }
- virtual void set_start_time(uint64_t _time) override { start_time = _time; }
- virtual uint32_t get_status() override { return status; }
- virtual void set_status(uint32_t _status) override { status = _status; }
- virtual void print(std::ostream& out) const override {
- out << bucket << ":" << oid << ":" << start_time << ":" << status;
- }
- };
-
- StoreLifecycle() = default;
- virtual ~StoreLifecycle() = default;
-
- virtual std::unique_ptr<LCEntry> get_entry() override {
- return std::make_unique<StoreLCEntry>();
- }
- using Lifecycle::get_entry;
-};
-
class StoreNotification : public Notification {
protected:
Object* obj;
diff --git a/src/seastar b/src/seastar
-Subproject ba30b8c4d33ff22deff8daee5b080aaa7a7b402
+Subproject 7d4ae901b5604cb23273a5f0341c8d5f2c553d4
diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t
index ff25b5973a4..52d8d4425fa 100644
--- a/src/test/cli/rbd/help.t
+++ b/src/test/cli/rbd/help.t
@@ -178,7 +178,8 @@
usage: rbd bench [--pool <pool>] [--namespace <namespace>] [--image <image>]
[--io-size <io-size>] [--io-threads <io-threads>]
[--io-total <io-total>] [--io-pattern <io-pattern>]
- [--rw-mix-read <rw-mix-read>] --io-type <io-type>
+ [--rw-mix-read <rw-mix-read>]
+ [--pattern-byte <pattern-byte>] --io-type <io-type>
<image-spec>
Simple benchmark.
@@ -196,6 +197,8 @@
--io-total arg total size for IO (in B/K/M/G/T) [default: 1G]
--io-pattern arg IO pattern (rand, seq, or full-seq) [default: seq]
--rw-mix-read arg read proportion in readwrite (<= 100) [default: 50]
+ --pattern-byte arg which byte value to write (integer between 0-255, rand
+ or rand-str [default: rand]
--io-type arg IO type (read, write, or readwrite(rw))
rbd help children
diff --git a/src/test/crimson/test_calc_subsets.cc b/src/test/crimson/test_calc_subsets.cc
index 7d23810c2e6..dae3cab84fe 100644
--- a/src/test/crimson/test_calc_subsets.cc
+++ b/src/test/crimson/test_calc_subsets.cc
@@ -190,6 +190,7 @@ TEST(clone_subsets, overlap)
ss.clones.push_back(snapid_t(2));
pg_missing_t missing;
pg_missing_item item;
+ item.clean_regions.mark_fully_dirty();
missing.add(clone, std::move(item));
hobject_t last_backfill{object_t{"foo1"}, "foo1", CEPH_NOSNAP, 42, 0, "nspace"};
diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc
index 238cffa1999..507bd6d1b26 100644
--- a/src/test/librados_test_stub/LibradosTestStub.cc
+++ b/src/test/librados_test_stub/LibradosTestStub.cc
@@ -603,6 +603,13 @@ int IoCtx::omap_get_vals(const std::string& oid,
max_return, out_vals));
}
+int IoCtx::omap_rm_keys(const std::string& oid,
+ const std::set<std::string>& keys) {
+ TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
+ return ctx->execute_operation(
+ oid, std::bind(&TestIoCtxImpl::omap_rm_keys, _1, _2, keys));
+}
+
int IoCtx::operate(const std::string& oid, ObjectWriteOperation *op) {
TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
TestObjectOperationImpl *ops = reinterpret_cast<TestObjectOperationImpl*>(op->impl);
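
The stub gains an omap_rm_keys() entry point matching the librados::IoCtx API, dispatched through execute_operation() like the other omap calls. A minimal caller sketch against the real API (the cluster, pool, and object wiring are assumed to exist already; the helper name is illustrative):

#include <rados/librados.hpp>
#include <set>
#include <string>

// Remove two omap keys from an existing object via an initialized IoCtx.
int rm_two_keys(librados::IoCtx& io_ctx, const std::string& oid) {
  const std::set<std::string> keys = {"key1", "key2"};
  return io_ctx.omap_rm_keys(oid, keys);
}
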
diff --git a/src/test/librbd/CMakeLists.txt b/src/test/librbd/CMakeLists.txt
index c3f0edbea5d..6b3d8c2681c 100644
--- a/src/test/librbd/CMakeLists.txt
+++ b/src/test/librbd/CMakeLists.txt
@@ -153,6 +153,11 @@ if(LINUX AND HAVE_LIBCRYPTSETUP)
crypto/luks/test_mock_LoadRequest.cc)
endif()
+if(HAVE_LIBNBD)
+ list(APPEND unittest_librbd_srcs
+ migration/test_mock_NBDStream.cc)
+endif()
+
# On Windows, we'll skip librbd unit tests for the time being, running just the
# functional tests. The reason is that the unit tests require libcls*, which in
# turn requires libos and libosd, however those libraries haven't been ported to
diff --git a/src/test/librbd/migration/test_mock_FileStream.cc b/src/test/librbd/migration/test_mock_FileStream.cc
index a5bdfebe4b8..b9729edf69a 100644
--- a/src/test/librbd/migration/test_mock_FileStream.cc
+++ b/src/test/librbd/migration/test_mock_FileStream.cc
@@ -209,5 +209,33 @@ TEST_F(TestMockMigrationFileStream, ShortReadError) {
ASSERT_EQ(0, ctx3.wait());
}
+TEST_F(TestMockMigrationFileStream, ListSparseExtents) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ bufferlist bl;
+ ASSERT_EQ(0, bl.write_file(file_name.c_str()));
+
+ MockFileStream mock_file_stream(&mock_image_ctx, json_object);
+
+ C_SaferCond ctx1;
+ mock_file_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ io::SparseExtents sparse_extents;
+ mock_file_stream.list_sparse_extents({{0, 128}, {256, 64}}, &sparse_extents,
+ &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128});
+ expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_DATA, 64});
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
+
+ C_SaferCond ctx3;
+ mock_file_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
} // namespace migration
} // namespace librbd
diff --git a/src/test/librbd/migration/test_mock_HttpStream.cc b/src/test/librbd/migration/test_mock_HttpStream.cc
index aff22b757e9..f928d9351ce 100644
--- a/src/test/librbd/migration/test_mock_HttpStream.cc
+++ b/src/test/librbd/migration/test_mock_HttpStream.cc
@@ -190,5 +190,36 @@ TEST_F(TestMockMigrationHttpStream, Read) {
ASSERT_EQ(0, ctx3.wait());
}
+TEST_F(TestMockMigrationHttpStream, ListSparseExtents) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_http_client = new MockHttpClient();
+ expect_open(*mock_http_client, 0);
+ expect_close(*mock_http_client, 0);
+
+ MockHttpStream mock_http_stream(&mock_image_ctx, json_object);
+
+ C_SaferCond ctx1;
+ mock_http_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ io::SparseExtents sparse_extents;
+ mock_http_stream.list_sparse_extents({{0, 128}, {256, 64}}, &sparse_extents,
+ &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128});
+ expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_DATA, 64});
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
+
+ C_SaferCond ctx3;
+ mock_http_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
} // namespace migration
} // namespace librbd
diff --git a/src/test/librbd/migration/test_mock_NBDStream.cc b/src/test/librbd/migration/test_mock_NBDStream.cc
new file mode 100644
index 00000000000..5977057b11f
--- /dev/null
+++ b/src/test/librbd/migration/test_mock_NBDStream.cc
@@ -0,0 +1,730 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_mock_fixture.h"
+#include "test/librbd/test_support.h"
+#include "include/rbd_types.h"
+#include "librbd/migration/NBDStream.h"
+#include "gtest/gtest.h"
+#include "gmock/gmock.h"
+#include "json_spirit/json_spirit.h"
+
+namespace librbd {
+namespace {
+
+struct MockTestImageCtx : public MockImageCtx {
+ MockTestImageCtx(ImageCtx &image_ctx) : MockImageCtx(image_ctx) {
+ }
+};
+
+} // anonymous namespace
+} // namespace librbd
+
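+// Pull in the template definitions so that NBDStream<MockTestImageCtx> is
+// instantiated against the mock image context declared above.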
+#include "librbd/migration/NBDStream.cc"
+
+namespace librbd {
+namespace migration {
+
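+// Mock specialization of NBDClient: create() hands back the singleton
+// registered by the test, so expectations can be set before the stream opens.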
+template <>
+struct NBDClient<MockTestImageCtx> {
+ static NBDClient* s_instance;
+ static NBDClient* create() {
+ ceph_assert(s_instance != nullptr);
+ return s_instance;
+ }
+
+ NBDClient() {
+ s_instance = this;
+ }
+
+ MOCK_METHOD0(get_error, const char*());
+ MOCK_METHOD0(get_errno, int());
+ MOCK_METHOD0(init, int());
+ MOCK_METHOD1(add_meta_context, int(const char*));
+ MOCK_METHOD1(connect_uri, int(const char*));
+ MOCK_METHOD0(get_size, int64_t());
+ MOCK_METHOD4(pread, int(void*, size_t, uint64_t, uint32_t));
+ MOCK_METHOD4(block_status, int(uint64_t, uint64_t, nbd_extent_callback,
+ uint32_t));
+ MOCK_METHOD1(shutdown, int(uint32_t));
+};
+
+NBDClient<MockTestImageCtx>* NBDClient<MockTestImageCtx>::s_instance = nullptr;
+
+using ::testing::_;
+using ::testing::Invoke;
+using ::testing::InSequence;
+using ::testing::Return;
+using ::testing::WithArg;
+
+class TestMockMigrationNBDStream : public TestMockFixture {
+public:
+ typedef NBDStream<MockTestImageCtx> MockNBDStream;
+ typedef NBDClient<MockTestImageCtx> MockNBDClient;
+
+ void SetUp() override {
+ TestMockFixture::SetUp();
+
+ ASSERT_EQ(0, open_image(m_image_name, &m_image_ctx));
+ m_json_object["uri"] = "nbd://foo.example";
+ }
+
+ void expect_get_errno(MockNBDClient& mock_nbd_client, int err) {
+ EXPECT_CALL(mock_nbd_client, get_errno()).WillOnce(Return(err));
+ EXPECT_CALL(mock_nbd_client, get_error()).WillOnce(Return("error message"));
+ }
+
+ void expect_init(MockNBDClient& mock_nbd_client, int rc) {
+ EXPECT_CALL(mock_nbd_client, init()).WillOnce(Return(rc));
+ }
+
+ void expect_add_meta_context(MockNBDClient& mock_nbd_client, int rc) {
+ EXPECT_CALL(mock_nbd_client, add_meta_context(_)).WillOnce(Return(rc));
+ }
+
+ void expect_connect_uri(MockNBDClient& mock_nbd_client, int rc) {
+ EXPECT_CALL(mock_nbd_client, connect_uri(_)).WillOnce(Return(rc));
+ }
+
+ void expect_get_size(MockNBDClient& mock_nbd_client, int64_t rc) {
+ EXPECT_CALL(mock_nbd_client, get_size()).WillOnce(Return(rc));
+ }
+
+ void expect_pread(MockNBDClient& mock_nbd_client, uint64_t byte_offset,
+ uint64_t byte_length, const void* buf, int rc) {
+ EXPECT_CALL(mock_nbd_client, pread(_, byte_length, byte_offset, _))
+ .WillOnce(WithArg<0>(Invoke(
+ [byte_length, buf, rc](void* out_buf) {
+ memcpy(out_buf, buf, byte_length);
+ return rc;
+ })));
+ }
+
+ struct block_status_cb_args {
+ const char* metacontext;
+ uint64_t entries_offset;
+ std::vector<uint32_t> entries;
+ };
+
+ // cbs is taken by non-const reference only because of
+ // nbd_extent_callback::callback() signature
+ void expect_block_status(MockNBDClient& mock_nbd_client,
+ uint64_t byte_offset, uint64_t byte_length,
+ std::vector<block_status_cb_args>& cbs, int rc) {
+ EXPECT_CALL(mock_nbd_client, block_status(byte_length, byte_offset, _, _))
+ .WillOnce(WithArg<2>(Invoke(
+ [&cbs, rc](nbd_extent_callback extent_callback) {
+ int err = 0;
+ for (auto& cb : cbs) {
+ extent_callback.callback(extent_callback.user_data, cb.metacontext,
+ cb.entries_offset, cb.entries.data(),
+ cb.entries.size(), &err);
+ }
+ return rc;
+ })));
+ }
+
+ void expect_shutdown(MockNBDClient& mock_nbd_client, int rc) {
+ EXPECT_CALL(mock_nbd_client, shutdown(_)).WillOnce(Return(rc));
+ }
+
+ librbd::ImageCtx *m_image_ctx;
+ json_spirit::mObject m_json_object;
+};
+
+TEST_F(TestMockMigrationNBDStream, OpenInvalidURI) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ m_json_object["uri"] = 123;
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(-EINVAL, ctx1.wait());
+
+ C_SaferCond ctx2;
+ mock_nbd_stream.close(&ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, OpenMissingURI) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ m_json_object.clear();
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(-EINVAL, ctx1.wait());
+
+ C_SaferCond ctx2;
+ mock_nbd_stream.close(&ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, OpenInitError) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, -1);
+ expect_get_errno(*mock_nbd_client, ENOMEM);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(-ENOMEM, ctx1.wait());
+
+ C_SaferCond ctx2;
+ mock_nbd_stream.close(&ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, OpenAddMetaContextError) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, -1);
+ expect_get_errno(*mock_nbd_client, EINVAL);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(-EINVAL, ctx1.wait());
+
+ C_SaferCond ctx2;
+ mock_nbd_stream.close(&ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, OpenConnectURIError) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, -1);
+ expect_get_errno(*mock_nbd_client, ECONNREFUSED);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(-ECONNREFUSED, ctx1.wait());
+
+ C_SaferCond ctx2;
+ mock_nbd_stream.close(&ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, OpenConnectURIErrorNoErrno) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, -1);
+ // libnbd actually does this for getaddrinfo() errors ("Name or
+ // service not known", etc)
+ expect_get_errno(*mock_nbd_client, 0);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(-EIO, ctx1.wait());
+
+ C_SaferCond ctx2;
+ mock_nbd_stream.close(&ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, GetSize) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ expect_get_size(*mock_nbd_client, 128);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ uint64_t size;
+ mock_nbd_stream.get_size(&size, &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+ ASSERT_EQ(128, size);
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, GetSizeError) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ expect_get_size(*mock_nbd_client, -1);
+ expect_get_errno(*mock_nbd_client, EOVERFLOW);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ uint64_t size;
+ mock_nbd_stream.get_size(&size, &ctx2);
+ ASSERT_EQ(-EOVERFLOW, ctx2.wait());
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, Read) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ std::string s1(128, '1');
+ expect_pread(*mock_nbd_client, 0, 128, s1.c_str(), 0);
+ std::string s2(64, '2');
+ expect_pread(*mock_nbd_client, 256, 64, s2.c_str(), 0);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ bufferlist bl;
+ mock_nbd_stream.read({{0, 128}, {256, 64}}, &bl, &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ bufferlist expected_bl;
+ expected_bl.append(s1);
+ expected_bl.append(s2);
+ ASSERT_EQ(expected_bl, bl);
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, ReadError) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ std::string s1(128, '1');
+ expect_pread(*mock_nbd_client, 0, 128, s1.c_str(), -1);
+ expect_get_errno(*mock_nbd_client, ERANGE);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ bufferlist bl;
+ mock_nbd_stream.read({{0, 128}, {256, 64}}, &bl, &ctx2);
+ ASSERT_EQ(-ERANGE, ctx2.wait());
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, ListSparseExtents) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ // DATA
+ std::vector<block_status_cb_args> cbs1 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {128, 0}}
+ };
+ expect_block_status(*mock_nbd_client, 0, 128, cbs1, 0);
+ // ZEROED (zero)
+ std::vector<block_status_cb_args> cbs2 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 256, {64, LIBNBD_STATE_ZERO}}
+ };
+ expect_block_status(*mock_nbd_client, 256, 64, cbs2, 0);
+ // ZEROED (hole)
+ std::vector<block_status_cb_args> cbs3 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 352, {32, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 352, 32, cbs3, 0);
+ // ZEROED, DATA
+ std::vector<block_status_cb_args> cbs4 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 384,
+ {56, LIBNBD_STATE_ZERO, 8, LIBNBD_STATE_HOLE, 16, 0}}
+ };
+ expect_block_status(*mock_nbd_client, 384, 80, cbs4, 0);
+ // DATA, ZEROED
+ std::vector<block_status_cb_args> cbs5 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 464,
+ {40, 0, 16, LIBNBD_STATE_HOLE, 8, LIBNBD_STATE_ZERO}}
+ };
+ expect_block_status(*mock_nbd_client, 464, 64, cbs5, 0);
+ // ZEROED, DATA, ZEROED
+ std::vector<block_status_cb_args> cbs6 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 528,
+ {80, LIBNBD_STATE_HOLE, 128, 0, 32, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 528, 240, cbs6, 0);
+ // DATA, ZEROED, DATA
+ std::vector<block_status_cb_args> cbs7 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 1536,
+ {48, 0, 256, LIBNBD_STATE_ZERO, 16, 0}}
+ };
+ expect_block_status(*mock_nbd_client, 1536, 320, cbs7, 0);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ io::SparseExtents sparse_extents;
+ mock_nbd_stream.list_sparse_extents({{0, 128}, {256, 64}, {352, 32},
+ {384, 80}, {464, 64}, {528, 240},
+ {1536, 320}}, &sparse_extents, &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128});
+ expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_ZEROED, 64});
+ expected_sparse_extents.insert(352, 96, {io::SPARSE_EXTENT_STATE_ZEROED, 96});
+ expected_sparse_extents.insert(448, 56, {io::SPARSE_EXTENT_STATE_DATA, 56});
+ expected_sparse_extents.insert(504, 104, {io::SPARSE_EXTENT_STATE_ZEROED, 104});
+ expected_sparse_extents.insert(608, 128, {io::SPARSE_EXTENT_STATE_DATA, 128});
+ expected_sparse_extents.insert(736, 32, {io::SPARSE_EXTENT_STATE_ZEROED, 32});
+ expected_sparse_extents.insert(1536, 48, {io::SPARSE_EXTENT_STATE_DATA, 48});
+ expected_sparse_extents.insert(1584, 256, {io::SPARSE_EXTENT_STATE_ZEROED, 256});
+ expected_sparse_extents.insert(1840, 16, {io::SPARSE_EXTENT_STATE_DATA, 16});
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, ListSparseExtentsMoreThanRequested) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ // extra byte at the end
+ std::vector<block_status_cb_args> cbs1 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {129, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 0, 128, cbs1, 0);
+ // extra byte at the start
+ std::vector<block_status_cb_args> cbs2 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 255, {65, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 256, 64, cbs2, 0);
+ // extra byte on both sides
+ std::vector<block_status_cb_args> cbs3 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 351, {34, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 352, 32, cbs3, 0);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ io::SparseExtents sparse_extents;
+ mock_nbd_stream.list_sparse_extents({{0, 128}, {256, 64}, {352, 32}},
+ &sparse_extents, &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_ZEROED, 128});
+ expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_ZEROED, 64});
+ expected_sparse_extents.insert(352, 32, {io::SPARSE_EXTENT_STATE_ZEROED, 32});
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, ListSparseExtentsLessThanRequested) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ // missing byte at the end
+ std::vector<block_status_cb_args> cbs1 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {127, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 0, 128, cbs1, 0);
+ // missing byte at the start
+ std::vector<block_status_cb_args> cbs2 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 257, {63, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 256, 64, cbs2, 0);
+ // missing byte on both sides
+ std::vector<block_status_cb_args> cbs3 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 353, {30, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 352, 32, cbs3, 0);
+ // zero-sized entry
+ std::vector<block_status_cb_args> cbs4 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 400, {0, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 400, 48, cbs4, 0);
+ // no entries
+ std::vector<block_status_cb_args> cbs5 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 520, {}}
+ };
+ expect_block_status(*mock_nbd_client, 520, 16, cbs5, 0);
+ // no callback
+ std::vector<block_status_cb_args> cbs6;
+ expect_block_status(*mock_nbd_client, 608, 8, cbs6, 0);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ io::SparseExtents sparse_extents;
+ mock_nbd_stream.list_sparse_extents({{0, 128}, {256, 64}, {352, 32},
+ {400, 48}, {520, 16}, {608, 8}},
+ &sparse_extents, &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 127, {io::SPARSE_EXTENT_STATE_ZEROED, 127});
+ expected_sparse_extents.insert(127, 1, {io::SPARSE_EXTENT_STATE_DATA, 1});
+ expected_sparse_extents.insert(256, 1, {io::SPARSE_EXTENT_STATE_DATA, 1});
+ expected_sparse_extents.insert(257, 63, {io::SPARSE_EXTENT_STATE_ZEROED, 63});
+ expected_sparse_extents.insert(352, 1, {io::SPARSE_EXTENT_STATE_DATA, 1});
+ expected_sparse_extents.insert(353, 30, {io::SPARSE_EXTENT_STATE_ZEROED, 30});
+ expected_sparse_extents.insert(383, 1, {io::SPARSE_EXTENT_STATE_DATA, 1});
+ expected_sparse_extents.insert(400, 48, {io::SPARSE_EXTENT_STATE_DATA, 48});
+ expected_sparse_extents.insert(520, 16, {io::SPARSE_EXTENT_STATE_DATA, 16});
+ expected_sparse_extents.insert(608, 8, {io::SPARSE_EXTENT_STATE_DATA, 8});
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, ListSparseExtentsMultipleCallbacks) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ std::vector<block_status_cb_args> cbs1 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 96, {32, LIBNBD_STATE_HOLE}},
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 32, {32, LIBNBD_STATE_ZERO}},
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {32, LIBNBD_STATE_ZERO}},
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 64, {32, LIBNBD_STATE_HOLE}}
+ };
+ expect_block_status(*mock_nbd_client, 0, 128, cbs1, 0);
+ std::vector<block_status_cb_args> cbs2 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 192, {32, 0}},
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 128, {32, LIBNBD_STATE_ZERO, 32, 0}},
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 224, {32, LIBNBD_STATE_ZERO}}
+ };
+ expect_block_status(*mock_nbd_client, 128, 128, cbs2, 0);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ io::SparseExtents sparse_extents;
+ mock_nbd_stream.list_sparse_extents({{0, 128}, {128, 128}}, &sparse_extents,
+ &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 160, {io::SPARSE_EXTENT_STATE_ZEROED, 160});
+ expected_sparse_extents.insert(160, 64, {io::SPARSE_EXTENT_STATE_DATA, 64});
+ expected_sparse_extents.insert(224, 32, {io::SPARSE_EXTENT_STATE_ZEROED, 32});
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, ListSparseExtentsUnexpectedMetaContexts) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ std::vector<block_status_cb_args> cbs = {
+ {"unexpected context 1", 0, {64, LIBNBD_STATE_ZERO, 64, 0}},
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 0, {32, LIBNBD_STATE_ZERO, 96, 0}},
+ {"unexpected context 2", 0, {128, LIBNBD_STATE_ZERO}}
+ };
+ expect_block_status(*mock_nbd_client, 0, 128, cbs, 0);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ io::SparseExtents sparse_extents;
+ mock_nbd_stream.list_sparse_extents({{0, 128}}, &sparse_extents, &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 32, {io::SPARSE_EXTENT_STATE_ZEROED, 32});
+ expected_sparse_extents.insert(32, 96, {io::SPARSE_EXTENT_STATE_DATA, 96});
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, ListSparseExtentsError) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ // error isn't propagated -- DATA is assumed instead
+ std::vector<block_status_cb_args> cbs1;
+ expect_block_status(*mock_nbd_client, 0, 128, cbs1, -1);
+ expect_get_errno(*mock_nbd_client, ENOTSUP);
+ std::vector<block_status_cb_args> cbs2 = {
+ {LIBNBD_CONTEXT_BASE_ALLOCATION, 256, {64, LIBNBD_STATE_ZERO}}
+ };
+ expect_block_status(*mock_nbd_client, 256, 64, cbs2, 0);
+ expect_shutdown(*mock_nbd_client, 0);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ io::SparseExtents sparse_extents;
+ mock_nbd_stream.list_sparse_extents({{0, 128}, {256, 64}}, &sparse_extents,
+ &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128});
+ expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_ZEROED, 64});
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
+
+ C_SaferCond ctx3;
+ mock_nbd_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
+TEST_F(TestMockMigrationNBDStream, ShutdownError) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_nbd_client = new MockNBDClient();
+ expect_init(*mock_nbd_client, 0);
+ expect_add_meta_context(*mock_nbd_client, 0);
+ expect_connect_uri(*mock_nbd_client, 0);
+ // error is ignored
+ expect_shutdown(*mock_nbd_client, -1);
+
+ MockNBDStream mock_nbd_stream(&mock_image_ctx, m_json_object);
+
+ C_SaferCond ctx1;
+ mock_nbd_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ mock_nbd_stream.close(&ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+}
+
+} // namespace migration
+} // namespace librbd
diff --git a/src/test/librbd/migration/test_mock_RawSnapshot.cc b/src/test/librbd/migration/test_mock_RawSnapshot.cc
index 3ce4b5c9daa..c9af8933652 100644
--- a/src/test/librbd/migration/test_mock_RawSnapshot.cc
+++ b/src/test/librbd/migration/test_mock_RawSnapshot.cc
@@ -105,6 +105,19 @@ public:
})));
}
+ void expect_stream_list_sparse_extents(MockStreamInterface& mock_stream_interface,
+ const io::Extents& byte_extents,
+ const io::SparseExtents& sparse_extents,
+ int r) {
+ EXPECT_CALL(mock_stream_interface, list_sparse_extents(byte_extents, _, _))
+ .WillOnce(WithArgs<1, 2>(Invoke(
+ [sparse_extents, r](io::SparseExtents* out_sparse_extents,
+ Context* ctx) {
+ out_sparse_extents->insert(sparse_extents);
+ ctx->complete(r);
+ })));
+ }
+
json_spirit::mObject json_object;
};
@@ -232,6 +245,11 @@ TEST_F(TestMockMigrationRawSnapshot, ListSnap) {
expect_stream_open(*mock_stream_interface, 0);
expect_stream_get_size(*mock_stream_interface, 0, 0);
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 123, {io::SPARSE_EXTENT_STATE_DATA, 123});
+ expect_stream_list_sparse_extents(*mock_stream_interface, {{0, 123}},
+ expected_sparse_extents, 0);
+
expect_stream_close(*mock_stream_interface, 0);
MockRawSnapshot mock_raw_snapshot(&mock_image_ctx, json_object,
@@ -245,6 +263,7 @@ TEST_F(TestMockMigrationRawSnapshot, ListSnap) {
io::SparseExtents sparse_extents;
mock_raw_snapshot.list_snap({{0, 123}}, 0, &sparse_extents, {}, &ctx2);
ASSERT_EQ(0, ctx2.wait());
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
C_SaferCond ctx3;
mock_raw_snapshot.close(&ctx3);
diff --git a/src/test/librbd/migration/test_mock_S3Stream.cc b/src/test/librbd/migration/test_mock_S3Stream.cc
index 2f2097f7926..272ed928965 100644
--- a/src/test/librbd/migration/test_mock_S3Stream.cc
+++ b/src/test/librbd/migration/test_mock_S3Stream.cc
@@ -234,5 +234,36 @@ TEST_F(TestMockMigrationS3Stream, ProcessRequest) {
ASSERT_EQ(0, ctx2.wait());
}
+TEST_F(TestMockMigrationS3Stream, ListSparseExtents) {
+ MockTestImageCtx mock_image_ctx(*m_image_ctx);
+
+ InSequence seq;
+
+ auto mock_http_client = new MockHttpClient();
+ expect_open(*mock_http_client, 0);
+ expect_close(*mock_http_client, 0);
+
+ MockS3Stream mock_s3_stream(&mock_image_ctx, json_object);
+
+ C_SaferCond ctx1;
+ mock_s3_stream.open(&ctx1);
+ ASSERT_EQ(0, ctx1.wait());
+
+ C_SaferCond ctx2;
+ io::SparseExtents sparse_extents;
+ mock_s3_stream.list_sparse_extents({{0, 128}, {256, 64}}, &sparse_extents,
+ &ctx2);
+ ASSERT_EQ(0, ctx2.wait());
+
+ io::SparseExtents expected_sparse_extents;
+ expected_sparse_extents.insert(0, 128, {io::SPARSE_EXTENT_STATE_DATA, 128});
+ expected_sparse_extents.insert(256, 64, {io::SPARSE_EXTENT_STATE_DATA, 64});
+ ASSERT_EQ(expected_sparse_extents, sparse_extents);
+
+ C_SaferCond ctx3;
+ mock_s3_stream.close(&ctx3);
+ ASSERT_EQ(0, ctx3.wait());
+}
+
} // namespace migration
} // namespace librbd
diff --git a/src/test/librbd/mock/migration/MockStreamInterface.h b/src/test/librbd/mock/migration/MockStreamInterface.h
index 36df86638de..1f33fee5438 100644
--- a/src/test/librbd/mock/migration/MockStreamInterface.h
+++ b/src/test/librbd/mock/migration/MockStreamInterface.h
@@ -21,6 +21,14 @@ struct MockStreamInterface : public StreamInterface {
void read(io::Extents&& byte_extents, bufferlist* bl, Context* on_finish) {
read(byte_extents, bl, on_finish);
}
+
+ MOCK_METHOD3(list_sparse_extents, void(const io::Extents&,
+ io::SparseExtents*, Context*));
+ void list_sparse_extents(io::Extents&& byte_extents,
+ io::SparseExtents* sparse_extents,
+ Context* on_finish) {
+ list_sparse_extents(byte_extents, sparse_extents, on_finish);
+ }
};
} // namespace migration
diff --git a/src/test/librbd/test_Groups.cc b/src/test/librbd/test_Groups.cc
index eaef20611ac..eb18473b988 100644
--- a/src/test/librbd/test_Groups.cc
+++ b/src/test/librbd/test_Groups.cc
@@ -5,12 +5,15 @@
#include "test/librbd/test_support.h"
#include "include/rbd/librbd.h"
#include "include/rbd/librbd.hpp"
+#include "librbd/api/Group.h"
#include "test/librados/test.h"
#include "gtest/gtest.h"
#include <boost/scope_exit.hpp>
#include <chrono>
#include <vector>
+#include <set>
+#include <algorithm>
void register_test_groups() {
}
@@ -772,3 +775,102 @@ TEST_F(TestGroup, snap_list2PP)
ASSERT_EQ(0, m_rbd.group_remove(m_ioctx, gp_name));
ASSERT_EQ(0, _rados.pool_delete(pool_name2.c_str()));
}
+
+TEST_F(TestGroup, snap_list_internal)
+{
+ REQUIRE_FORMAT_V2();
+
+ // Check that the listing works with different
+ // values for try_to_sort and fail_if_not_sorted
+
+ librados::IoCtx ioctx;
+ ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+ const char *group_name = "gp_snaplist_internalPP";
+
+ librbd::RBD rbd;
+ ASSERT_EQ(0, rbd.group_create(ioctx, group_name));
+
+ std::vector<librbd::group_snap_info2_t> gp_snaps;
+
+ // No snaps present
+ ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, true, true,
+ &gp_snaps));
+ ASSERT_EQ(0U, gp_snaps.size());
+
+ ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, false, false,
+ &gp_snaps));
+ ASSERT_EQ(0U, gp_snaps.size());
+
+ // Create a stale snap_order key by deleting the snapshot_ key
+ ASSERT_EQ(0, librbd::api::Group<>::snap_create(ioctx, group_name,
+ "test-snap", 0));
+ ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, false, false,
+ &gp_snaps));
+ ASSERT_EQ(1U, gp_snaps.size());
+
+ std::string group_id;
+ ASSERT_EQ(0, librbd::api::Group<>::get_id(ioctx, group_name, &group_id));
+
+ std::string group_header = RBD_GROUP_HEADER_PREFIX + group_id;
+ std::set<std::string> keys = {"snapshot_" + gp_snaps[0].id};
+ ASSERT_EQ(0, ioctx.omap_rm_keys(group_header, keys));
+
+ for (int i = 0; i < 20; i++) {
+ std::string name = "snap" + stringify(i);
+ ASSERT_EQ(0, librbd::api::Group<>::snap_create(ioctx, group_name,
+ name.c_str(), 0));
+ }
+
+ ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, true, true,
+ &gp_snaps));
+ ASSERT_EQ(20U, gp_snaps.size());
+
+ // Verify that the sorted list is correct
+  for (size_t i = 0; i < gp_snaps.size(); i++) {
+ std::string name = "snap" + stringify(i);
+ ASSERT_EQ(name, gp_snaps[i].name);
+ }
+
+ // Sort on group snap ids to simulate the unsorted list.
+ std::vector<librbd::group_snap_info2_t> snaps_sorted_by_id = gp_snaps;
+ std::sort(snaps_sorted_by_id.begin(), snaps_sorted_by_id.end(),
+ [](const librbd::group_snap_info2_t &a,
+ const librbd::group_snap_info2_t &b) {
+ return a.id < b.id;
+ });
+
+ // Check that the vectors actually differ
+ bool differ = false;
+ for (size_t i = 0; i < gp_snaps.size(); i++) {
+ if (gp_snaps[i].id != snaps_sorted_by_id[i].id) {
+ differ = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(differ);
+
+ // Remove the snap_order key for one of the snaps.
+ keys = {"snap_order_" + gp_snaps[1].id};
+ ASSERT_EQ(0, ioctx.omap_rm_keys(group_header, keys));
+
+  // This should fail.
+ ASSERT_EQ(-EINVAL, librbd::api::Group<>::snap_list(ioctx, group_name, true,
+ true, &gp_snaps));
+
+ // Should work if fail_if_not_sorted is false
+ ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, true, false,
+ &gp_snaps));
+ ASSERT_EQ(20U, gp_snaps.size());
+
+ ASSERT_EQ(0, librbd::api::Group<>::snap_list(ioctx, group_name, false, false,
+ &gp_snaps));
+ ASSERT_EQ(20U, gp_snaps.size());
+
+  // Compare unsorted listing
+  for (size_t i = 0; i < gp_snaps.size(); i++) {
+ ASSERT_EQ(snaps_sorted_by_id[i].id, gp_snaps[i].id);
+ }
+
+ ASSERT_EQ(0, rbd.group_remove(ioctx, group_name));
+}
diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc
index 5b20db909ce..38b62f3ea6d 100644
--- a/src/test/objectstore/store_test.cc
+++ b/src/test/objectstore/store_test.cc
@@ -4364,7 +4364,7 @@ public:
gen_type *rng;
ObjectStore *store;
ObjectStore::CollectionHandle ch;
-
+ std::vector<ObjectStore::Transaction> tt;
ceph::mutex lock = ceph::make_mutex("State lock");
ceph::condition_variable cond;
@@ -4378,6 +4378,24 @@ public:
}
};
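+  // Transactions from write-type operations are batched in 'tt' and flushed
+  // either explicitly (queue_transactions) before reads, stats, fsck and
+  // shutdown, or opportunistically (maybe_queue_transactions) at random or
+  // once the batch grows past 10 entries.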
+ int queue_transactions()
+ {
+ int result = 0;
+ if (!tt.empty()) {
+ result = store->queue_transactions(ch, tt);
+ tt.clear();
+ }
+ return result;
+ }
+ int maybe_queue_transactions()
+ {
+ int result = 0;
+ if ((rand() % 7) == 0 || tt.size() > 10) {
+ result = store->queue_transactions(ch, tt);
+ tt.clear();
+ }
+ return result;
+ }
class C_SyntheticOnReadable : public Context {
public:
SyntheticWorkloadState *state;
@@ -4495,6 +4513,7 @@ public:
}
void shutdown() {
ghobject_t next;
+ queue_transactions();
while (1) {
vector<ghobject_t> objects;
int r = collection_list(store, ch, next, ghobject_t::get_max(), 10,
@@ -4515,6 +4534,7 @@ public:
queue_transaction(store, ch, std::move(t));
}
void statfs(store_statfs_t& stat) {
+ queue_transactions();
store->statfs(&stat);
}
@@ -4550,6 +4570,7 @@ public:
void wait_for_done() {
std::unique_lock locker{lock};
+ queue_transactions();
cond.wait(locker, [this] { return in_flight == 0; });
}
@@ -4623,7 +4644,7 @@ public:
wait_for_ready(locker);
ghobject_t new_obj = object_gen->create_object(rng);
available_objects.erase(new_obj);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
t.touch(cid, new_obj);
boost::uniform_int<> u(17, 22);
boost::uniform_int<> v(12, 17);
@@ -4636,7 +4657,7 @@ public:
if (!contents.count(new_obj))
contents[new_obj] = Object();
t.register_on_applied(new C_SyntheticOnReadable(this, new_obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
@@ -4659,7 +4680,7 @@ public:
new_obj.generation++;
available_objects.erase(new_obj);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
t.collection_move_rename(cid, old_obj, cid, new_obj);
++in_flight;
in_flight_objects.insert(old_obj);
@@ -4668,7 +4689,7 @@ public:
contents[new_obj].data = contents[old_obj].data;
contents.erase(old_obj);
t.register_on_applied(new C_SyntheticOnStash(this, old_obj, new_obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
@@ -4692,7 +4713,7 @@ public:
new_obj.hobj.set_hash(old_obj.hobj.get_hash());
available_objects.erase(new_obj);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
t.clone(cid, old_obj, new_obj);
++in_flight;
in_flight_objects.insert(old_obj);
@@ -4701,7 +4722,7 @@ public:
contents[new_obj].data = contents[old_obj].data;
t.register_on_applied(new C_SyntheticOnClone(this, old_obj, new_obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
@@ -4751,7 +4772,7 @@ public:
<< " (size " << srcdata.length() << ") to "
<< dstoff << "~" << len << std::endl;
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
t.clone_range(cid, old_obj, new_obj, srcoff, len, dstoff);
++in_flight;
in_flight_objects.insert(old_obj);
@@ -4783,7 +4804,7 @@ public:
}
t.register_on_applied(new C_SyntheticOnClone(this, old_obj, new_obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
@@ -4797,7 +4818,7 @@ public:
ghobject_t new_obj = get_uniform_random_object(locker);
available_objects.erase(new_obj);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
boost::uniform_int<> u1(0, max_object_len - max_write_len);
boost::uniform_int<> u2(0, max_write_len);
@@ -4832,7 +4853,7 @@ public:
++in_flight;
in_flight_objects.insert(new_obj);
t.register_on_applied(new C_SyntheticOnReadable(this, new_obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
@@ -4845,7 +4866,7 @@ public:
ghobject_t obj = get_uniform_random_object(locker);
available_objects.erase(obj);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
boost::uniform_int<> choose(0, max_object_len);
size_t len = choose(*rng);
@@ -4866,7 +4887,7 @@ public:
}
t.register_on_applied(new C_SyntheticOnReadable(this, obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
@@ -4879,7 +4900,7 @@ public:
ghobject_t new_obj = get_uniform_random_object(locker);
available_objects.erase(new_obj);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
boost::uniform_int<> u1(0, max_object_len - max_write_len);
boost::uniform_int<> u2(0, max_write_len);
@@ -4907,12 +4928,13 @@ public:
++in_flight;
in_flight_objects.insert(new_obj);
t.register_on_applied(new C_SyntheticOnReadable(this, new_obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
void read() {
EnterExit ee("read");
+ queue_transactions();
boost::uniform_int<> u1(0, max_object_len/2);
boost::uniform_int<> u2(0, max_object_len);
uint64_t offset = u1(*rng);
@@ -4962,7 +4984,7 @@ public:
ghobject_t obj = get_uniform_random_object(locker);
available_objects.erase(obj);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
boost::uniform_int<> u0(1, max_attr_size);
boost::uniform_int<> u1(4, max_attr_name_len);
@@ -4997,7 +5019,7 @@ public:
++in_flight;
in_flight_objects.insert(obj);
t.register_on_applied(new C_SyntheticOnReadable(this, obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
@@ -5010,7 +5032,7 @@ public:
ghobject_t obj = get_next_object(locker);
available_objects.erase(obj);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
map<string, bufferlist, less<>> attrs;
set<string> keys;
@@ -5026,12 +5048,13 @@ public:
++in_flight;
in_flight_objects.insert(obj);
t.register_on_applied(new C_SyntheticOnReadable(this, obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
void getattrs() {
EnterExit ee("getattrs");
+ queue_transactions();
ghobject_t obj;
map<string, bufferlist> expected;
{
@@ -5061,6 +5084,7 @@ public:
void getattr() {
EnterExit ee("getattr");
+ queue_transactions();
ghobject_t obj;
int r;
int retry;
@@ -5118,19 +5142,20 @@ public:
}
available_objects.erase(obj);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
t.rmattr(cid, obj, it->first);
contents[obj].attrs.erase(it->first);
++in_flight;
in_flight_objects.insert(obj);
t.register_on_applied(new C_SyntheticOnReadable(this, obj));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
void fsck(bool deep) {
std::unique_lock locker{lock};
+ queue_transactions();
EnterExit ee("fsck");
cond.wait(locker, [this] { return in_flight == 0; });
ch.reset();
@@ -5144,6 +5169,7 @@ public:
void scan() {
std::unique_lock locker{lock};
EnterExit ee("scan");
+ queue_transactions();
cond.wait(locker, [this] { return in_flight == 0; });
vector<ghobject_t> objects;
set<ghobject_t> objects_set, objects_set2;
@@ -5201,6 +5227,7 @@ public:
void stat() {
EnterExit ee("stat");
+ queue_transactions();
ghobject_t hoid;
uint64_t expected;
{
@@ -5235,14 +5262,14 @@ public:
if (!can_unlink())
return -ENOENT;
ghobject_t to_remove = get_uniform_random_object(locker);
- ObjectStore::Transaction t;
+ ObjectStore::Transaction& t = tt.emplace_back();
t.remove(cid, to_remove);
++in_flight;
available_objects.erase(to_remove);
in_flight_objects.insert(to_remove);
contents.erase(to_remove);
t.register_on_applied(new C_SyntheticOnReadable(this, to_remove));
- int status = store->queue_transaction(ch, std::move(t));
+ int status = maybe_queue_transactions();
return status;
}
@@ -5281,35 +5308,36 @@ void StoreTestBase::doSyntheticTest(
cerr << "Op " << i << std::endl;
test_obj.print_internal_state();
}
- boost::uniform_int<> true_false(0, 999);
+ boost::uniform_int<> true_false(0, 9999);
int val = true_false(rng);
- if (val > 998) {
+ if (val > 9998) {
test_obj.fsck(true);
- } else if (val > 997) {
+ } else if (val > 9997) {
test_obj.fsck(false);
- } else if (val > 970) {
+ } else if (val > 9900) {
test_obj.scan();
- } else if (val > 950) {
+ } else if (val > 9500) {
test_obj.stat();
- } else if (val > 850) {
+ } else if (val > 8500) {
test_obj.zero();
- } else if (val > 800) {
+ } else if (val > 8000) {
test_obj.unlink();
- } else if (val > 550) {
+ } else if (val > 5500) {
test_obj.write();
- } else if (val > 500) {
+ } else if (val > 5000) {
test_obj.clone();
- } else if (val > 450) {
+ } else if (val > 4500) {
test_obj.clone_range();
- } else if (val > 300) {
+ } else if (val > 3000) {
test_obj.stash();
- } else if (val > 100) {
+ } else if (val > 1000) {
test_obj.read();
} else {
test_obj.truncate();
}
}
test_obj.wait_for_done();
+ test_obj.fsck(true);
test_obj.shutdown();
}
@@ -5338,7 +5366,7 @@ void StoreTestBase::doSyntheticLimitedTest(
cerr << "Op " << i << std::endl;
test_obj.print_internal_state();
}
- boost::uniform_int<> true_false(0, 9999 /*999*/);
+ boost::uniform_int<> true_false(0, 99999 /*999*/);
int val = true_false(rng);
auto option = [&](int range) -> bool {
if (val == -1) {
@@ -5352,21 +5380,22 @@ void StoreTestBase::doSyntheticLimitedTest(
return false;
}
};
- if (option(1)) test_obj.fsck(true);
- if (option(1)) test_obj.fsck(false);
- if (option(1)) test_obj.scan();
- if (option(497)) test_obj.stat();
- if (option(1000)) test_obj.zero();
- if (option(1500)) test_obj.read();
- if (option(1500)) test_obj.write();
- if (option(500)) test_obj.truncate();
- if (option(1000)) test_obj.clone_range();
- if (option(1000)) test_obj.stash();
- if (option(1500)) test_obj.unlink();
- if (option(1500)) test_obj.clone();
+ if (option(3)) test_obj.fsck(true);
+ if (option(3)) test_obj.fsck(false);
+ if (option(94)) test_obj.scan();
+ if (option(4900)) test_obj.stat();
+ if (option(10000)) test_obj.zero();
+ if (option(15000)) test_obj.read();
+ if (option(15000)) test_obj.write();
+ if (option(5000)) test_obj.truncate();
+ if (option(10000)) test_obj.clone_range();
+ if (option(10000)) test_obj.stash();
+ if (option(15000)) test_obj.unlink();
+ if (option(15000)) test_obj.clone();
ceph_assert(val == -1);
}
test_obj.wait_for_done();
+ test_obj.fsck(true);
test_obj.shutdown();
}
diff --git a/src/test/rgw/test_rgw_posix_driver.cc b/src/test/rgw/test_rgw_posix_driver.cc
index caf3c41c8d4..5f624d770d1 100644
--- a/src/test/rgw/test_rgw_posix_driver.cc
+++ b/src/test/rgw/test_rgw_posix_driver.cc
@@ -1709,12 +1709,13 @@ public:
off_t ofs{0};
uint64_t accounted_size{0};
std::string tag;
+ rgw::sal::MultipartUpload::prefix_map_t processed_prefixes;
ACLOwner owner;
owner.id = bucket->get_owner();
int ret = upload->complete(env->dpp, null_yield, get_pointer(env->cct), parts,
remove_objs, accounted_size, compressed, cs_info,
- ofs, tag, owner, 0, mp_obj.get());
+ ofs, tag, owner, 0, mp_obj.get(), processed_prefixes);
EXPECT_EQ(ret, 0);
EXPECT_EQ(write_size, ofs);
EXPECT_EQ(write_size, accounted_size);
@@ -2480,6 +2481,7 @@ public:
off_t ofs{0};
uint64_t accounted_size{0};
std::string tag;
+ rgw::sal::MultipartUpload::prefix_map_t processed_prefixes;
ACLOwner owner;
owner.id = bucket->get_owner();
mp_obj->gen_rand_obj_instance_name();
@@ -2489,7 +2491,7 @@ public:
int ret = upload->complete(env->dpp, null_yield, get_pointer(env->cct), parts,
remove_objs, accounted_size, compressed, cs_info,
- ofs, tag, owner, 0, mp_obj.get());
+ ofs, tag, owner, 0, mp_obj.get(), processed_prefixes);
EXPECT_EQ(ret, 0);
EXPECT_EQ(write_size, ofs);
EXPECT_EQ(write_size, accounted_size);
diff --git a/src/tools/rbd/action/Bench.cc b/src/tools/rbd/action/Bench.cc
index 53f2b6410ae..d4c7a037489 100644
--- a/src/tools/rbd/action/Bench.cc
+++ b/src/tools/rbd/action/Bench.cc
@@ -49,9 +49,13 @@ enum io_pattern_t {
IO_PATTERN_FULL_SEQ
};
+const int PATTERN_BYTE_RAND = -1;
+const int PATTERN_BYTE_RAND_STR = -2;
+
struct IOType {};
struct Size {};
struct IOPattern {};
+struct PatternByte {};
void validate(boost::any& v, const std::vector<std::string>& values,
Size *target_type, int) {
@@ -103,6 +107,27 @@ void validate(boost::any& v, const std::vector<std::string>& values,
v = boost::any(io_type);
}
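+
+// Parse --pattern-byte: accepts "rand", "rand-str", or an integer in [0, 255];
+// PATTERN_BYTE_RAND and PATTERN_BYTE_RAND_STR encode the two non-numeric choices.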
+void validate(boost::any& v, const std::vector<std::string>& values,
+ PatternByte *target_type, int) {
+ po::validators::check_first_occurrence(v);
+ const std::string &s = po::validators::get_single_string(values);
+ if (s == "rand") {
+ v = boost::any(PATTERN_BYTE_RAND);
+ } else if (s == "rand-str") {
+ v = boost::any(PATTERN_BYTE_RAND_STR);
+ } else {
+ try {
+ int pattern_byte = boost::lexical_cast<int>(s);
+ if (pattern_byte >= 0 && pattern_byte <= 255) {
+ v = boost::any(pattern_byte);
+ return;
+ }
+ } catch (const boost::bad_lexical_cast &) {
+ }
+ throw po::validation_error(po::validation_error::invalid_option_value);
+ }
+}
+
} // anonymous namespace
static void rbd_bencher_completion(void *c, void *pc);
@@ -132,8 +157,10 @@ struct rbd_bencher {
io_type_t io_type;
uint64_t io_size;
bufferlist write_bl;
+ int pattern_byte;
- explicit rbd_bencher(librbd::Image *i, io_type_t io_type, uint64_t io_size)
+ explicit rbd_bencher(librbd::Image *i, io_type_t io_type, uint64_t io_size,
+ int pattern_byte)
: image(i),
in_flight(0),
io_type(io_type),
@@ -141,7 +168,15 @@ struct rbd_bencher {
{
if (io_type == IO_TYPE_WRITE || io_type == IO_TYPE_RW) {
bufferptr bp(io_size);
- memset(bp.c_str(), rand() & 0xff, io_size);
+ if (pattern_byte == PATTERN_BYTE_RAND) {
+ memset(bp.c_str(), rand() & 0xff, io_size);
+ } else if (pattern_byte == PATTERN_BYTE_RAND_STR) {
+ for (uint64_t i = 0; i < io_size; i++) {
+ bp.c_str()[i] = rand() & 0xff;
+ }
+ } else {
+ memset(bp.c_str(), pattern_byte, io_size);
+ }
write_bl.push_back(bp);
}
}
@@ -212,7 +247,7 @@ bool should_read(uint64_t read_proportion)
int do_bench(librbd::Image& image, io_type_t io_type,
uint64_t io_size, uint64_t io_threads,
uint64_t io_bytes, io_pattern_t io_pattern,
- uint64_t read_proportion)
+ uint64_t read_proportion, int pattern_byte)
{
uint64_t size = 0;
image.size(&size);
@@ -236,7 +271,7 @@ int do_bench(librbd::Image& image, io_type_t io_type,
// seed rand() before constructing rbd_bencher
srand(time(NULL) % (unsigned long) -1);
- rbd_bencher b(&image, io_type, io_size);
+ rbd_bencher b(&image, io_type, io_size, pattern_byte);
std::cout << "bench "
<< " type " << (io_type == IO_TYPE_READ ? "read" :
@@ -443,7 +478,9 @@ void add_bench_common_options(po::options_description *positional,
("io-threads", po::value<uint32_t>(), "ios in flight [default: 16]")
("io-total", po::value<Size>(), "total size for IO (in B/K/M/G/T) [default: 1G]")
("io-pattern", po::value<IOPattern>(), "IO pattern (rand, seq, or full-seq) [default: seq]")
- ("rw-mix-read", po::value<uint64_t>(), "read proportion in readwrite (<= 100) [default: 50]");
+ ("rw-mix-read", po::value<uint64_t>(), "read proportion in readwrite (<= 100) [default: 50]")
+ ("pattern-byte", po::value<PatternByte>(),
+ "which byte value to write (integer between 0-255, rand or rand-str [default: rand]");
}
void get_arguments_for_write(po::options_description *positional,
@@ -512,6 +549,13 @@ int bench_execute(const po::variables_map &vm, io_type_t bench_io_type) {
bench_pattern = IO_PATTERN_SEQ;
}
+ int pattern_byte;
+ if (vm.count("pattern-byte")) {
+ pattern_byte = vm["pattern-byte"].as<int>();
+ } else {
+ pattern_byte = PATTERN_BYTE_RAND;
+ }
+
uint64_t bench_read_proportion;
if (bench_io_type == IO_TYPE_READ) {
bench_read_proportion = 100;
@@ -545,7 +589,7 @@ int bench_execute(const po::variables_map &vm, io_type_t bench_io_type) {
register_async_signal_handler_oneshot(SIGTERM, handle_signal);
r = do_bench(image, bench_io_type, bench_io_size, bench_io_threads,
- bench_bytes, bench_pattern, bench_read_proportion);
+ bench_bytes, bench_pattern, bench_read_proportion, pattern_byte);
unregister_async_signal_handler(SIGHUP, sighup_handler);
unregister_async_signal_handler(SIGINT, handle_signal);
diff --git a/src/tools/rbd_ggate/ggate_drv.c b/src/tools/rbd_ggate/ggate_drv.c
index 11f6cf0a466..813faf34c57 100644
--- a/src/tools/rbd_ggate/ggate_drv.c
+++ b/src/tools/rbd_ggate/ggate_drv.c
@@ -130,7 +130,7 @@ int ggate_drv_create(char *name, size_t namelen, size_t sectorsize,
drv = calloc(1, sizeof(*drv));
if (drv == NULL) {
- errno = -ENOMEM;
+ errno = ENOMEM;
goto fail_close;
}