diff options
38 files changed, 302 insertions, 89 deletions
diff --git a/.githubmap b/.githubmap index 5265fa59bed..c8ae6e284a2 100644 --- a/.githubmap +++ b/.githubmap @@ -9,6 +9,7 @@ # a2batic Kanika Murarka <kmurarka@redhat.com> aaSharma14 Aashish Sharma <aasharma@redhat.com> +abhishek-kane Abhishek Kane <abhishek.kane@ibm.com> <abhishek.kane@gmail.com> aclamk Adam Kupczyk <akupczyk@redhat.com> adamemerson Adam C. Emerson <aemerson@redhat.com> adk3798 Adam King <adking@redhat.com> @@ -13,6 +13,7 @@ Aashish Sharma <aasharma@redhat.com> <66050535+aaSharma14@users.noreply.github.c Aashish Sharma <aasharma@redhat.com> <aasharma@li-e74156cc-2f67-11b2-a85c-e98659a63c5c.ibm.com> Aashish Sharma <aasharma@redhat.com> <aashishsharma@fedora.redhat.com> Aashish Sharma <aasharma@redhat.com> <aashishsharma@localhost.localdomain> +Abhishek Kane <abhishek.kane@ibm.com> <abhishek.kane@gmail.com> Abhishek Lekshmanan <abhishek.lekshmanan@cern.ch> <abhishek.l@cern.ch> Abhishek Lekshmanan <abhishek@suse.com> <abhishek.lekshmanan@gmail.com> Abhishek Lekshmanan <abhishek@suse.com> <alekshmanan@suse.com> diff --git a/.organizationmap b/.organizationmap index e59e6ae24e1..ac9b0ea70fe 100644 --- a/.organizationmap +++ b/.organizationmap @@ -345,6 +345,7 @@ Huawei <contact@huawei.com> Yehu <yehu5@huawei.com> Huayun <contact@huayun.com> Zheng Yin <zhengyin@huayun.com> Huazhong University of Science and Technology <contact@hust.edu.cn> Luo Runbing <runsisi@hust.edu.cn> HXT Semiconductor <contact@hxt-semitech.org> Jiang Yutang <yutang2.jiang@hxt-semitech.com> +IBM <contact@IBM.com> Abhishek Kane <abhishek.kane@ibm.com> IBM <contact@IBM.com> Adam Kupczyk <akupczyk@ibm.com> IBM <contact@IBM.com> Afreen Misbah <afreen@ibm.com> IBM <contact@IBM.com> Aliaksei Makarau <aliaksei.makarau@ibm.com> diff --git a/.peoplemap b/.peoplemap index 418e8505fb4..ed70830c092 100644 --- a/.peoplemap +++ b/.peoplemap @@ -16,6 +16,7 @@ # # git log --pretty='%aN <%aE>' $range | git -c mailmap.file=.peoplemap check-mailmap --stdin | sort | uniq | sed -e 's/\(.*\) \(<.*\)/\2 \1/' | uniq --skip-field=1 --all-repeated | sed -e 's/\(.*>\) \(.*\)/\2 \1/' # +Abhishek Kane <abhishek.kane@ibm.com> <abhishek.kane@gmail.com> Abhishek Lekshmanan <abhishek.lekshmanan@cern.ch> <abhishek@suse.com> Adam Kupczyk <akupczyk@ibm.com> <akupczyk@redhat.com> <akupczyk@mirantis.com> Alexandre Marangone <amarango@redhat.com> Alexandre Marangone <alexandre.marangone@inktank.com> diff --git a/PendingReleaseNotes b/PendingReleaseNotes index b4824a65584..d25acfa9c6d 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -60,6 +60,10 @@ fuse client for `fallocate` for the default case (i.e. mode == 0) since CephFS does not support disk space reservation. The only flags supported are `FALLOC_FL_KEEP_SIZE` and `FALLOC_FL_PUNCH_HOLE`. +* pybind/rados: Fixes WriteOp.zero() in the original reversed order of arguments + `offset` and `length`. When pybind calls WriteOp.zero(), the argument passed + does not match rados_write_op_zero, and offset and length are swapped, which + results in an unexpected response. * The HeadBucket API now reports the `X-RGW-Bytes-Used` and `X-RGW-Object-Count` headers only when the `read-stats` querystring is explicitly included in the diff --git a/container/Containerfile b/container/Containerfile index c954ebed1be..9a5a88e76a1 100644 --- a/container/Containerfile +++ b/container/Containerfile @@ -212,6 +212,7 @@ RUN rpm -q $(cat packages.txt) && rm -f /var/lib/rpm/__db* && rm -f *packages.tx # Set some envs in the container for quickly inspecting details about the build at runtime ENV CEPH_IS_DEVEL="${CI_CONTAINER}" \ CEPH_REF="${CEPH_REF}" \ + CEPH_VERSION="${CEPH_REF}" \ CEPH_OSD_FLAVOR="${OSD_FLAVOR}" \ FROM_IMAGE="${FROM_IMAGE}" diff --git a/debian/control b/debian/control index a7d2dbb4c3a..a8c79f7a731 100644 --- a/debian/control +++ b/debian/control @@ -999,7 +999,8 @@ Depends: librados2 (= ${binary:Version}), liblua5.3-0, ${misc:Depends}, ${shlibs:Depends}, -Suggests: luarocks, +Suggests: liblua5.3-dev, + luarocks, Description: RADOS Gateway client library RADOS is a distributed object store used by the Ceph distributed storage system. This package provides a REST gateway to the diff --git a/doc/cephfs/disaster-recovery-experts.rst b/doc/cephfs/disaster-recovery-experts.rst index 7677b42f47e..b01a3dfde6a 100644 --- a/doc/cephfs/disaster-recovery-experts.rst +++ b/doc/cephfs/disaster-recovery-experts.rst @@ -21,43 +21,46 @@ Advanced: Metadata repair tools Journal export -------------- -Before attempting dangerous operations, make a copy of the journal like so: +Before attempting any dangerous operation, make a copy of the journal by +running the following command: -:: +.. prompt:: bash # - cephfs-journal-tool journal export backup.bin + cephfs-journal-tool journal export backup.bin -Note that this command may not always work if the journal is badly corrupted, -in which case a RADOS-level copy should be made (http://tracker.ceph.com/issues/9902). +If the journal is badly corrupted, this command might not work. If the journal +is badly corrupted, make a RADOS-level copy +(http://tracker.ceph.com/issues/9902). Dentry recovery from journal ---------------------------- If a journal is damaged or for any reason an MDS is incapable of replaying it, -attempt to recover what file metadata we can like so: +attempt to recover file metadata by running the following command: -:: +.. prompt:: bash # - cephfs-journal-tool event recover_dentries summary + cephfs-journal-tool event recover_dentries summary -This command by default acts on MDS rank 0, pass --rank=<n> to operate on other ranks. +By default, this command acts on MDS rank ``0``. Pass the option ``--rank=<n>`` +to the ``cephfs-journal-tool`` command to operate on other ranks. -This command will write any inodes/dentries recoverable from the journal -into the backing store, if these inodes/dentries are higher-versioned -than the previous contents of the backing store. If any regions of the journal -are missing/damaged, they will be skipped. +This command writes all inodes and dentries recoverable from the journal into +the backing store, but only if these inodes and dentries are higher-versioned +than the existing contents of the backing store. Any regions of the journal +that are missing or damaged will be skipped. -Note that in addition to writing out dentries and inodes, this command will update -the InoTables of each 'in' MDS rank, to indicate that any written inodes' numbers -are now in use. In simple cases, this will result in an entirely valid backing +In addition to writing out dentries and inodes, this command updates the +InoTables of each ``in`` MDS rank, to indicate that any written inodes' numbers +are now in use. In simple cases, this will result in an entirely valid backing store state. .. warning:: - The resulting state of the backing store is not guaranteed to be self-consistent, - and an online MDS scrub will be required afterwards. The journal contents - will not be modified by this command, you should truncate the journal + The resulting state of the backing store is not guaranteed to be + self-consistent, and an online MDS scrub will be required afterwards. The + journal contents will not be modified by this command. Truncate the journal separately after recovering what you can. Journal truncation diff --git a/doc/radosgw/config-ref.rst b/doc/radosgw/config-ref.rst index b4aa56fff54..405bc727208 100644 --- a/doc/radosgw/config-ref.rst +++ b/doc/radosgw/config-ref.rst @@ -75,10 +75,11 @@ aggressiveness of lifecycle processing: .. confval:: rgw_lc_max_wp_worker These values can be tuned based upon your specific workload to further increase the -aggressiveness of lifecycle processing. For a workload with a larger number of buckets (thousands) -you would look at increasing the :confval:`rgw_lc_max_worker` value from the default value of 3 whereas for a -workload with a smaller number of buckets but higher number of objects (hundreds of thousands) -per bucket you would consider increasing :confval:`rgw_lc_max_wp_worker` from the default value of 3. +aggressiveness of lifecycle processing. For a workload with a large number of buckets (thousands) +you would raise the number of workers by increasing :confval:`rgw_lc_max_worker` +from the default value of 3. Whereas for a workload with a higher number of objects per bucket +(hundreds of thousands) you would raise the number of parallel threads +by increasing :confval:`rgw_lc_max_wp_worker` from the default value of 3. .. note:: When looking to tune either of these specific values please validate the current Cluster performance and Ceph Object Gateway utilization before increasing. diff --git a/qa/standalone/osd/osd-bluefs-volume-ops.sh b/qa/standalone/osd/osd-bluefs-volume-ops.sh index aedfbc9b5cb..f7424de8ce1 100755 --- a/qa/standalone/osd/osd-bluefs-volume-ops.sh +++ b/qa/standalone/osd/osd-bluefs-volume-ops.sh @@ -72,7 +72,7 @@ function TEST_bluestore() { truncate $dir/0/block -s 4294967296 # 4GB ceph-bluestore-tool --path $dir/0 bluefs-bdev-expand || return 1 - truncate $dir/1/block -s 4311744512 # 4GB + 16MB + truncate $dir/1/block -s 11811160064 # 11GB to get bdev label at 10737418240 ceph-bluestore-tool --path $dir/1 bluefs-bdev-expand || return 1 truncate $dir/2/block -s 4295099392 # 4GB + 129KB ceph-bluestore-tool --path $dir/2 bluefs-bdev-expand || return 1 diff --git a/qa/suites/rados/verify/clusters/fixed-4.yaml b/qa/suites/rados/verify/clusters/fixed-4.yaml new file mode 120000 index 00000000000..aa88300715a --- /dev/null +++ b/qa/suites/rados/verify/clusters/fixed-4.yaml @@ -0,0 +1 @@ +.qa/clusters/fixed-4.yaml
\ No newline at end of file diff --git a/qa/suites/rgw/lua/tasks/0-install.yaml b/qa/suites/rgw/lua/tasks/0-install.yaml index fa6e279145c..d85ebcc5998 100644 --- a/qa/suites/rgw/lua/tasks/0-install.yaml +++ b/qa/suites/rgw/lua/tasks/0-install.yaml @@ -3,7 +3,7 @@ tasks: - ceph: - openssl_keys: - rgw: [client.0] -- tox: [client.0] +- tox: [client.0] overrides: ceph: @@ -11,3 +11,11 @@ overrides: global: osd_min_pg_log_entries: 10 osd_max_pg_log_entries: 10 + install: + ceph: + extra_system_packages: + rpm: + - luarocks + deb: + - liblua5.3-dev + - luarocks diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py index 9b04e3dc675..8f666d2fa9b 100644 --- a/qa/tasks/ceph.py +++ b/qa/tasks/ceph.py @@ -1206,8 +1206,18 @@ def cluster(ctx, config): args.extend([ run.Raw('|'), 'head', '-n', '1', ]) - stdout = mon0_remote.sh(args) - return stdout or None + r = mon0_remote.run( + stdout=BytesIO(), + args=args, + stderr=StringIO(), + ) + stdout = r.stdout.getvalue().decode() + if stdout: + return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr + return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', config['log_ignorelist']) is not None: diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py index dab61c2c700..0cde6050718 100644 --- a/qa/tasks/cephadm.py +++ b/qa/tasks/cephadm.py @@ -475,12 +475,16 @@ def ceph_log(ctx, config): run.Raw('|'), 'head', '-n', '1', ]) r = ctx.ceph[cluster_name].bootstrap_remote.run( - stdout=StringIO(), + stdout=BytesIO(), args=args, + stderr=StringIO(), ) - stdout = r.stdout.getvalue() - if stdout != '': + stdout = r.stdout.getvalue().decode() + if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None # NOTE: technically the first and third arg to first_in_ceph_log diff --git a/qa/tasks/rook.py b/qa/tasks/rook.py index 6cb75173966..fae5ef3bf00 100644 --- a/qa/tasks/rook.py +++ b/qa/tasks/rook.py @@ -8,7 +8,7 @@ import json import logging import os import yaml -from io import BytesIO +from io import BytesIO, StringIO from tarfile import ReadError from tasks.ceph_manager import CephManager @@ -235,10 +235,14 @@ def ceph_log(ctx, config): r = ctx.rook[cluster_name].remote.run( stdout=BytesIO(), args=args, + stderr=StringIO(), ) stdout = r.stdout.getvalue().decode() if stdout: return stdout + stderr = r.stderr.getvalue() + if stderr: + return stderr return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 5874a3dce56..e66b5aa08c7 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4167,7 +4167,7 @@ void Server::handle_client_getattr(const MDRequestRef& mdr, bool is_lookup) if (r < 0) { // fall-thru. let rdlock_path_pin_ref() check again. - } else if (is_lookup) { + } else if (is_lookup && mdr->dn[0].size()) { CDentry* dn = mdr->dn[0].back(); mdr->pin(dn); auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple()); @@ -4274,7 +4274,7 @@ void Server::handle_client_getattr(const MDRequestRef& mdr, bool is_lookup) // reply dout(10) << "reply to stat on " << *req << dendl; mdr->tracei = ref; - if (is_lookup) + if (is_lookup && mdr->dn[0].size()) mdr->tracedn = mdr->dn[0].back(); respond_to_request(mdr, 0); } diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 7332ec3edb1..833bdddc71b 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -4024,7 +4024,7 @@ void Monitor::handle_command(MonOpRequestRef op) for (auto& p : mgrstatmon()->get_service_map().services) { auto &service = p.first; - if (ServiceMap::is_normal_ceph_entity(service)) { + if (ServiceMap::is_normal_ceph_entity(service) || service == "nvmeof") { continue; } f->open_object_section(service.c_str()); diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc index 7ed5321dcda..eb04e3b8e98 100644 --- a/src/msg/async/EventEpoll.cc +++ b/src/msg/async/EventEpoll.cc @@ -17,6 +17,7 @@ #include "common/errno.h" #include <fcntl.h> #include "EventEpoll.h" +#include "Timeout.h" #define dout_subsys ceph_subsys_ms @@ -120,8 +121,7 @@ int EpollDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct ti { int retval, numevents = 0; - retval = epoll_wait(epfd, events, nevent, - tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + retval = epoll_wait(epfd, events, nevent, timeout_to_milliseconds(tvp)); if (retval > 0) { numevents = retval; fired_events.resize(numevents); diff --git a/src/msg/async/EventPoll.cc b/src/msg/async/EventPoll.cc index 4c09dbb4db4..f46528715e3 100644 --- a/src/msg/async/EventPoll.cc +++ b/src/msg/async/EventPoll.cc @@ -15,6 +15,7 @@ #include "common/errno.h" #include "EventPoll.h" +#include "Timeout.h" #include <unistd.h> #define dout_subsys ceph_subsys_ms @@ -161,11 +162,9 @@ int PollDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct timeval *tvp) { int retval, numevents = 0; #ifdef _WIN32 - retval = WSAPoll(pfds, max_pfds, - tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + retval = WSAPoll(pfds, max_pfds, timeout_to_milliseconds(tvp)); #else - retval = poll(pfds, max_pfds, - tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + retval = poll(pfds, max_pfds, timeout_to_milliseconds(tvp)); #endif if (retval > 0) { for (int j = 0; j < max_pfds; j++) { diff --git a/src/msg/async/Timeout.h b/src/msg/async/Timeout.h new file mode 100644 index 00000000000..b8df1b40761 --- /dev/null +++ b/src/msg/async/Timeout.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2024 IONOS SE + * + * Author: Max Kellermann <max.kellermann@ionos.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_TIMEOUT_H +#define CEPH_MSG_TIMEOUT_H + +#include "include/intarith.h" // for div_round_up() + +#include <time.h> // for struct timeval + +/** + * Convert the given `struct timeval` to milliseconds. + * + * This is supposed to be used as timeout parameter to system calls + * such as poll() and epoll_wait(). + */ +constexpr int +timeout_to_milliseconds(const struct timeval &tv) noexcept +{ + /* round up to the next millisecond so we don't wake up too early */ + return tv.tv_sec * 1000 + div_round_up(tv.tv_usec, 1000); +} + +/** + * This overload makes the timeout optional; on nullptr, it returns + * -1. + */ +constexpr int +timeout_to_milliseconds(const struct timeval *tv) noexcept +{ + return tv != nullptr ? timeout_to_milliseconds(*tv) : -1; +} + +#endif diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 2f88acdc93b..50f293d45fd 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -3794,7 +3794,7 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ if (offset > fnode.size) { ceph_abort_msg("truncate up not supported"); } - ceph_assert(offset <= fnode.size); + _flush_bdev(h); { std::lock_guard ll(log.lock); @@ -3803,44 +3803,42 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ vselector->sub_usage(h->file->vselector_hint, fnode); uint64_t x_off = 0; auto p = fnode.seek(offset, &x_off); - uint64_t cut_off = - (p == fnode.extents.end()) ? 0 : p2roundup(x_off, alloc_size[p->bdev]); - uint64_t new_allocated; - if (0 == cut_off) { - // whole pextent to remove - changed_extents = true; - new_allocated = offset; - } else if (cut_off < p->length) { - dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off); - new_allocated = (offset - x_off) + cut_off; - p->length = cut_off; - changed_extents = true; - ++p; - } else { - ceph_assert(cut_off >= p->length); - new_allocated = (offset - x_off) + p->length; - // just leave it here - ++p; - } - while (p != fnode.extents.end()) { - dirty.pending_release[p->bdev].insert(p->offset, p->length); - p = fnode.extents.erase(p); - changed_extents = true; + if (p != fnode.extents.end()) { + uint64_t cut_off = p2roundup(x_off, alloc_size[p->bdev]); + if (0 == cut_off) { + // whole pextent to remove + fnode.allocated = offset; + changed_extents = true; + } else if (cut_off < p->length) { + dirty.pending_release[p->bdev].insert(p->offset + cut_off, + p->length - cut_off); + fnode.allocated = (offset - x_off) + cut_off; + p->length = cut_off; + changed_extents = true; + ++p; + } else { + // cut_off > p->length means that we misaligned the extent + ceph_assert(cut_off == p->length); + fnode.allocated = (offset - x_off) + p->length; + ++p; // leave extent untouched + } + while (p != fnode.extents.end()) { + dirty.pending_release[p->bdev].insert(p->offset, p->length); + p = fnode.extents.erase(p); + changed_extents = true; + } } if (changed_extents) { fnode.size = offset; - fnode.allocated = new_allocated; fnode.reset_delta(); fnode.recalc_allocated(); log.t.op_file_update(fnode); // sad, but is_dirty must be set to signal flushing of the log h->file->is_dirty = true; - } else { - if (offset != fnode.size) { - fnode.size = offset; - //skipping log.t.op_file_update_inc, it will be done by flush() - h->file->is_dirty = true; - } + } else if (offset != fnode.size) { + fnode.size = offset; + // skipping log.t.op_file_update_inc, it will be done by flush() + h->file->is_dirty = true; } vselector->add_usage(h->file->vselector_hint, fnode); } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 25e6c4fe596..8f1d995fa8d 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6930,8 +6930,19 @@ int BlueStore::_check_main_bdev_label() return -EIO; } if (bluestore_bdev_label_require_all && r != 0) { - derr << __func__ << " not all labels read properly" << dendl; - return -EIO; + // We are about to complain that some labels failed. + // But in case if we expanded block device some labels will not be good. + uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); + uint32_t valid_locations = 0; + for (uint64_t loc : bdev_label_positions) { + if (loc + lsize <= bdev_label.size) { + ++valid_locations; + } + } + if (valid_locations != bdev_label_valid_locations.size()) { + derr << __func__ << " not all labels read properly" << dendl; + return -EIO; + } } return 0; } @@ -8967,11 +8978,25 @@ int BlueStore::expand_devices(ostream& out) _close_db_and_around(); // mount in read/write to sync expansion changes + if (bdev_label_multi) { + // We need not do fsck, because we can be broken - size is increased, + // but we might not have labels set. + cct->_conf.set_val_or_die("bluestore_fsck_on_mount", "false"); + } r = _mount(); ceph_assert(r == 0); if (fm && fm->is_null_manager()) { // we grow the allocation range, must reflect it in the allocation file alloc->init_add_free(size0, size - size0); + if (bdev_label_multi) { + uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); + for (uint64_t loc : bdev_label_positions) { + if ((loc >= size0) && (loc + lsize <= size)) { + bdev_label_valid_locations.push_back(loc); + } + } + _write_bdev_label(cct, bdev, path + "/block", bdev_label, bdev_label_valid_locations); + } need_to_destage_allocation_file = true; } umount(); diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index b3fd526815e..8acec94f382 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -47,6 +47,7 @@ class NvmeofService(CephService): # TODO: check if we can force jinja2 to generate dicts with double quotes instead of using json.dumps transport_tcp_options = json.dumps(spec.transport_tcp_options) if spec.transport_tcp_options else None + iobuf_options = json.dumps(spec.iobuf_options) if spec.iobuf_options else None name = '{}.{}'.format(utils.name_to_config_section('nvmeof'), nvmeof_gw_id) rados_id = name[len('client.'):] if name.startswith('client.') else name @@ -67,6 +68,7 @@ class NvmeofService(CephService): 'rpc_socket_dir': '/var/tmp/', 'rpc_socket_name': 'spdk.sock', 'transport_tcp_options': transport_tcp_options, + 'iobuf_options': iobuf_options, 'rados_id': rados_id } gw_conf = self.mgr.template.render('services/nvmeof/ceph-nvmeof.conf.j2', context) diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 index 37f2db52732..2a9ab309568 100644 --- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 @@ -86,6 +86,9 @@ transport_tcp_options = {{ transport_tcp_options }} {% if spec.tgt_cmd_extra_args %} tgt_cmd_extra_args = {{ spec.tgt_cmd_extra_args }} {% endif %} +{% if iobuf_options %} +iobuf_options = {{ iobuf_options }} +{% endif %} [monitor] timeout = {{ spec.monitor_timeout }} diff --git a/src/pybind/rados/rados.pyx b/src/pybind/rados/rados.pyx index b54ebb483c6..bcfa6777f3d 100644 --- a/src/pybind/rados/rados.pyx +++ b/src/pybind/rados/rados.pyx @@ -1870,7 +1870,7 @@ cdef class WriteOp(object): uint64_t _offset = offset with nogil: - rados_write_op_zero(self.write_op, _length, _offset) + rados_write_op_zero(self.write_op, _offset, _length) def truncate(self, offset: int): """ diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 1ac9fa49e32..6869d5b2188 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -1384,6 +1384,7 @@ class NvmeofServiceSpec(ServiceSpec): transport_tcp_options: Optional[Dict[str, int]] = {"in_capsule_data_size": 8192, "max_io_qpairs_per_ctrlr": 7}, tgt_cmd_extra_args: Optional[str] = None, + iobuf_options: Optional[Dict[str, int]] = None, discovery_addr: Optional[str] = None, discovery_addr_map: Optional[Dict[str, str]] = None, discovery_port: Optional[int] = None, @@ -1520,6 +1521,8 @@ class NvmeofServiceSpec(ServiceSpec): self.transport_tcp_options: Optional[Dict[str, int]] = transport_tcp_options #: ``tgt_cmd_extra_args`` extra arguments for the nvmf_tgt process self.tgt_cmd_extra_args = tgt_cmd_extra_args + #: List of extra arguments for SPDK iobuf in the form opt=value + self.iobuf_options: Optional[Dict[str, int]] = iobuf_options #: ``discovery_addr`` address of the discovery service self.discovery_addr = discovery_addr #: ``discovery_addr_map`` per node address map of the discovery service diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc index c0a9059a251..1302f278f59 100644 --- a/src/rgw/driver/rados/rgw_data_sync.cc +++ b/src/rgw/driver/rados/rgw_data_sync.cc @@ -3021,7 +3021,7 @@ public: if (!dest_bucket_perms.verify_bucket_permission(dest_key.value_or(key), rgw::IAM::s3PutObject)) { ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to write into bucket (bucket=" << sync_pipe.info.dest_bucket.get_key() << ")" << dendl; - return -EPERM; + return set_cr_error(-EPERM); } } @@ -4520,7 +4520,7 @@ public: } tn->set_resource_name(SSTR(bucket_str_noinstance(bs.bucket) << "/" << key)); } - if (retcode == -ERR_PRECONDITION_FAILED || retcode == -EPERM) { + if (retcode == -ERR_PRECONDITION_FAILED || retcode == -EPERM || retcode == -EACCES) { pretty_print(sc->env, "Skipping object s3://{}/{} in sync from zone {}\n", bs.bucket.name, key, zone_name); set_status("Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)"); diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc index 69075c506f1..a183feabe2a 100644 --- a/src/rgw/driver/rados/rgw_rados.cc +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -8951,7 +8951,7 @@ int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, return r; } - auto iter = attrset.find(RGW_ATTR_OLH_VER); + auto iter = attrset.find(RGW_ATTR_OLH_INFO); if (iter == attrset.end()) { /* not an olh */ return -EINVAL; } diff --git a/src/rgw/driver/rados/rgw_rest_log.cc b/src/rgw/driver/rados/rgw_rest_log.cc index 9315dfc0afd..72216a471b3 100644 --- a/src/rgw/driver/rados/rgw_rest_log.cc +++ b/src/rgw/driver/rados/rgw_rest_log.cc @@ -1061,7 +1061,7 @@ void RGWOp_BILog_Status::execute(optional_yield y) if (!pipe.dest.bucket) { /* Uh oh, something went wrong */ - ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl; + ldpp_dout(this, 0) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl; op_ret = -EIO; return; } diff --git a/src/rgw/driver/rados/rgw_user.cc b/src/rgw/driver/rados/rgw_user.cc index 894d8e40950..cce593c6bd5 100644 --- a/src/rgw/driver/rados/rgw_user.cc +++ b/src/rgw/driver/rados/rgw_user.cc @@ -189,6 +189,11 @@ static void dump_user_info(Formatter *f, RGWUserInfo &info, } encode_json("type", user_source_type, f); encode_json("mfa_ids", info.mfa_ids, f); + encode_json("account_id", info.account_id, f); + encode_json("path", info.path, f); + encode_json("create_date", info.create_date, f); + encode_json("tags", info.tags, f); + encode_json("group_ids", info.group_ids, f); if (stats) { encode_json("stats", *stats, f); } diff --git a/src/rgw/radosgw-admin/radosgw-admin.cc b/src/rgw/radosgw-admin/radosgw-admin.cc index 47b68d3f902..13936c87952 100644 --- a/src/rgw/radosgw-admin/radosgw-admin.cc +++ b/src/rgw/radosgw-admin/radosgw-admin.cc @@ -2543,8 +2543,8 @@ static void sync_status(Formatter *formatter) struct indented { int w; // indent width - std::string_view header; - indented(int w, std::string_view header = "") : w(w), header(header) {} + std::string header; + indented(int w, std::string header = "") : w(w), header(header) {} }; std::ostream& operator<<(std::ostream& out, const indented& h) { return out << std::setw(h.w) << h.header << std::setw(1) << ' '; @@ -2552,10 +2552,10 @@ std::ostream& operator<<(std::ostream& out, const indented& h) { struct bucket_source_sync_info { const RGWZone& _source; - std::string_view error; + std::string error; std::map<int,std::string> shards_behind; int total_shards; - std::string_view status; + std::string status; rgw_bucket bucket_source; bucket_source_sync_info(const RGWZone& source): _source(source) {} @@ -3075,14 +3075,12 @@ static int bucket_sync_status(rgw::sal::Driver* driver, const RGWBucketInfo& inf } if (pipe.source.zone.value_or(rgw_zone_id()) == z->second.id) { bucket_source_sync_info source_sync_info(z->second); - auto ret = bucket_source_sync_status(dpp(), static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone(), z->second, + bucket_source_sync_status(dpp(), static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone(), z->second, c->second, info, pipe, source_sync_info); - if (ret == 0) { - bucket_sync_info.source_status_info.emplace_back(std::move(source_sync_info)); - } + bucket_sync_info.source_status_info.emplace_back(std::move(source_sync_info)); } } } diff --git a/src/rgw/radosgw-admin/sync_checkpoint.cc b/src/rgw/radosgw-admin/sync_checkpoint.cc index 0303ed6c747..97da99bdc27 100644 --- a/src/rgw/radosgw-admin/sync_checkpoint.cc +++ b/src/rgw/radosgw-admin/sync_checkpoint.cc @@ -228,6 +228,7 @@ int rgw_bucket_sync_checkpoint(const DoutPrefixProvider* dpp, } auto& entry = sources.emplace_back(); entry.pipe = pipe; + entry.pipe.dest.bucket = info.bucket; // so it contains the bucket key (+bucket id) // fetch remote markers boost::asio::spawn(ioctx, [&] (boost::asio::yield_context yield) { diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 99f7db4f569..88f5f7a9c52 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -1428,6 +1428,7 @@ struct RGWBucketEnt { size_t size; size_t size_rounded; ceph::real_time creation_time; + ceph::real_time modification_time; uint64_t count; /* The placement_rule is necessary to calculate per-storage-policy statics diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 9f25d275852..d6f846b0d2f 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -8587,6 +8587,10 @@ void RGWGetBucketPolicy::execute(optional_yield y) void RGWDeleteBucketPolicy::send_response() { + if (!op_ret) { + /* A successful Delete Bucket Policy should return a 204 on success */ + op_ret = STATUS_NO_CONTENT; + } if (op_ret) { set_req_state_err(s, op_ret); } @@ -9262,4 +9266,3 @@ void rgw_slo_entry::decode_json(JSONObj *obj) JSONDecoder::decode_json("etag", etag, obj); JSONDecoder::decode_json("size_bytes", size_bytes, obj); }; - diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index b8ff3ca2fe8..88af0fc9c27 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -361,6 +361,7 @@ void RGWListBuckets_ObjStore_SWIFT::dump_bucket_entry(const RGWBucketEnt& ent) if (need_stats) { s->formatter->dump_int("count", ent.count); s->formatter->dump_int("bytes", ent.size); + dump_time(s, "last_modified", ent.modification_time); } s->formatter->close_section(); diff --git a/src/rgw/services/svc_bucket_sobj.cc b/src/rgw/services/svc_bucket_sobj.cc index ca705c5a44d..0f4cd4e847b 100644 --- a/src/rgw/services/svc_bucket_sobj.cc +++ b/src/rgw/services/svc_bucket_sobj.cc @@ -556,7 +556,7 @@ int RGWSI_Bucket_SObj::read_bucket_stats(const rgw_bucket& bucket, const DoutPrefixProvider *dpp) { RGWBucketInfo bucket_info; - int ret = read_bucket_info(bucket, &bucket_info, nullptr, nullptr, boost::none, y, dpp); + int ret = read_bucket_info(bucket, &bucket_info, &ent->modification_time, nullptr, boost::none, y, dpp); if (ret < 0) { return ret; } diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index 60147b5397c..32173d61afe 100644 --- a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -1426,6 +1426,87 @@ TEST(BlueFS, test_concurrent_dir_link_and_compact_log_56210) { } } +TEST(BlueFS, truncate_drops_allocations) { + constexpr uint64_t K = 1024; + constexpr uint64_t M = 1024 * K; + uuid_d fsid; + const char* DIR_NAME="dir"; + const char* FILE_NAME="file1"; + struct { + uint64_t preallocated_size; + uint64_t write_size; + uint64_t truncate_to; + uint64_t allocated_after_truncate; + uint64_t slow_size = 0; + uint64_t slow_alloc_size = 64*K; + uint64_t db_size = 128*M; + uint64_t db_alloc_size = 1*M; + } scenarios [] = { + // on DB(which is SLOW) : 1 => 1, 64K remains + { 1*M, 1, 1, 64*K }, + // on DB(which is SLOW), alloc 4K : 1 => 1, 4K remains + { 1*M, 1, 1, 4*K, 0, 4*K }, + // on DB(which is SLOW), truncation on AU boundary : 128K => 128K, 128K remains + { 1*M, 128*K, 128*K, 128*K }, + // on DB(which is SLOW), no prealloc, truncation to 0 : 1666K => 0, 0 remains + { 0, 1666*K, 0, 0 }, + // on DB, truncate to 123K, expect 1M occupied + { 1234*K, 123*K, 123*K, 1*M, 128*M, 64*K, 10*M, 1*M }, + // on DB, truncate to 0, expect 0 occupied + { 1234*K, 345*K, 0, 0, 128*M, 64*K, 10*M, 1*M }, + // on DB, truncate to AU boundary, expect exactly 1M occupied + { 1234*K, 1123*K, 1*M, 1*M, 128*M, 64*K, 10*M, 1*M }, + // on DB and SLOW, truncate only data on SLOW + { 0, 10*M+1, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M }, + // on DB and SLOW, preallocate and truncate only data on SLOW + { 6*M, 12*M, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M }, + // on DB and SLOW, preallocate and truncate all in SLOW and some on DB + // note! prealloc 6M is important, one allocation for 12M will fallback to SLOW + // in 6M + 6M we can be sure that 6M is on DB and 6M is on SLOW + { 6*M, 12*M, 3*M+1, 4*M, 128*M, 64*K, 11*M, 1*M }, + }; + for (auto& s : scenarios) { + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_shared_alloc_size", stringify(s.slow_alloc_size).c_str()); + conf.SetVal("bluefs_alloc_size", stringify(s.db_alloc_size).c_str()); + + g_ceph_context->_conf.set_val("bluefs_shared_alloc_size", stringify(s.slow_alloc_size)); + g_ceph_context->_conf.set_val("bluefs_alloc_size", stringify(s.db_alloc_size)); + TempBdev bdev_db{s.db_size}; + TempBdev bdev_slow{s.slow_size}; + + BlueFS fs(g_ceph_context); + if (s.db_size != 0) { + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0)); + } + if (s.slow_size != 0) { + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0)); + } + + ASSERT_EQ(0, fs.mkfs(fsid, {BlueFS::BDEV_DB, false, false})); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({BlueFS::BDEV_DB, false, false})); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, false)); + uint64_t pre = fs.get_used(); + ASSERT_EQ(0, fs.preallocate(h->file, 0, s.preallocated_size)); + const std::string content(s.write_size, 'x'); + h->append(content.c_str(), content.length()); + fs.fsync(h); + ASSERT_EQ(0, fs.truncate(h, s.truncate_to)); + fs.fsync(h); + uint64_t post = fs.get_used(); + fs.close_writer(h); + EXPECT_EQ(pre, post - s.allocated_after_truncate); + + fs.umount(); + } +} + + + + TEST(BlueFS, test_log_runway) { uint64_t max_log_runway = 65536; ConfSaver conf(g_ceph_context->_conf); diff --git a/src/test/pybind/test_rados.py b/src/test/pybind/test_rados.py index 25423bd8dcb..881b29c9152 100644 --- a/src/test/pybind/test_rados.py +++ b/src/test/pybind/test_rados.py @@ -516,6 +516,11 @@ class TestIoctx(object): eq(self.ioctx.read('write_ops'), b'12\x00\x005') write_op.write_full(b'12345') + write_op.zero(0, 2) + self.ioctx.operate_write_op(write_op, "write_ops") + eq(self.ioctx.read('write_ops'), b'\x00\x00345') + + write_op.write_full(b'12345') write_op.truncate(2) self.ioctx.operate_write_op(write_op, "write_ops") eq(self.ioctx.read('write_ops'), b'12') |