summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.githubmap1
-rw-r--r--.mailmap1
-rw-r--r--.organizationmap1
-rw-r--r--.peoplemap1
-rw-r--r--PendingReleaseNotes4
-rw-r--r--container/Containerfile1
-rw-r--r--debian/control3
-rw-r--r--doc/cephfs/disaster-recovery-experts.rst41
-rw-r--r--doc/radosgw/config-ref.rst9
-rwxr-xr-xqa/standalone/osd/osd-bluefs-volume-ops.sh2
l---------qa/suites/rados/verify/clusters/fixed-4.yaml1
-rw-r--r--qa/suites/rgw/lua/tasks/0-install.yaml10
-rw-r--r--qa/tasks/ceph.py14
-rw-r--r--qa/tasks/cephadm.py10
-rw-r--r--qa/tasks/rook.py6
-rw-r--r--src/mds/Server.cc4
-rw-r--r--src/mon/Monitor.cc2
-rw-r--r--src/msg/async/EventEpoll.cc4
-rw-r--r--src/msg/async/EventPoll.cc7
-rw-r--r--src/msg/async/Timeout.h47
-rw-r--r--src/os/bluestore/BlueFS.cc60
-rw-r--r--src/os/bluestore/BlueStore.cc29
-rw-r--r--src/pybind/mgr/cephadm/services/nvmeof.py2
-rw-r--r--src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j23
-rw-r--r--src/pybind/rados/rados.pyx2
-rw-r--r--src/python-common/ceph/deployment/service_spec.py3
-rw-r--r--src/rgw/driver/rados/rgw_data_sync.cc4
-rw-r--r--src/rgw/driver/rados/rgw_rados.cc2
-rw-r--r--src/rgw/driver/rados/rgw_user.cc5
-rw-r--r--src/rgw/radosgw-admin/radosgw-admin.cc14
-rw-r--r--src/rgw/rgw_common.h1
-rw-r--r--src/rgw/rgw_op.cc5
-rw-r--r--src/rgw/rgw_rest_swift.cc1
-rw-r--r--src/rgw/services/svc_bucket_sobj.cc2
-rw-r--r--src/test/objectstore/test_bluefs.cc81
-rw-r--r--src/test/pybind/test_rados.py5
36 files changed, 300 insertions, 88 deletions
diff --git a/.githubmap b/.githubmap
index 5265fa59bed..c8ae6e284a2 100644
--- a/.githubmap
+++ b/.githubmap
@@ -9,6 +9,7 @@
#
a2batic Kanika Murarka <kmurarka@redhat.com>
aaSharma14 Aashish Sharma <aasharma@redhat.com>
+abhishek-kane Abhishek Kane <abhishek.kane@ibm.com> <abhishek.kane@gmail.com>
aclamk Adam Kupczyk <akupczyk@redhat.com>
adamemerson Adam C. Emerson <aemerson@redhat.com>
adk3798 Adam King <adking@redhat.com>
diff --git a/.mailmap b/.mailmap
index 6322c4ba523..e111f70a3d0 100644
--- a/.mailmap
+++ b/.mailmap
@@ -13,6 +13,7 @@ Aashish Sharma <aasharma@redhat.com> <66050535+aaSharma14@users.noreply.github.c
Aashish Sharma <aasharma@redhat.com> <aasharma@li-e74156cc-2f67-11b2-a85c-e98659a63c5c.ibm.com>
Aashish Sharma <aasharma@redhat.com> <aashishsharma@fedora.redhat.com>
Aashish Sharma <aasharma@redhat.com> <aashishsharma@localhost.localdomain>
+Abhishek Kane <abhishek.kane@ibm.com> <abhishek.kane@gmail.com>
Abhishek Lekshmanan <abhishek.lekshmanan@cern.ch> <abhishek.l@cern.ch>
Abhishek Lekshmanan <abhishek@suse.com> <abhishek.lekshmanan@gmail.com>
Abhishek Lekshmanan <abhishek@suse.com> <alekshmanan@suse.com>
diff --git a/.organizationmap b/.organizationmap
index e59e6ae24e1..ac9b0ea70fe 100644
--- a/.organizationmap
+++ b/.organizationmap
@@ -345,6 +345,7 @@ Huawei <contact@huawei.com> Yehu <yehu5@huawei.com>
Huayun <contact@huayun.com> Zheng Yin <zhengyin@huayun.com>
Huazhong University of Science and Technology <contact@hust.edu.cn> Luo Runbing <runsisi@hust.edu.cn>
HXT Semiconductor <contact@hxt-semitech.org> Jiang Yutang <yutang2.jiang@hxt-semitech.com>
+IBM <contact@IBM.com> Abhishek Kane <abhishek.kane@ibm.com>
IBM <contact@IBM.com> Adam Kupczyk <akupczyk@ibm.com>
IBM <contact@IBM.com> Afreen Misbah <afreen@ibm.com>
IBM <contact@IBM.com> Aliaksei Makarau <aliaksei.makarau@ibm.com>
diff --git a/.peoplemap b/.peoplemap
index 418e8505fb4..ed70830c092 100644
--- a/.peoplemap
+++ b/.peoplemap
@@ -16,6 +16,7 @@
#
# git log --pretty='%aN <%aE>' $range | git -c mailmap.file=.peoplemap check-mailmap --stdin | sort | uniq | sed -e 's/\(.*\) \(<.*\)/\2 \1/' | uniq --skip-field=1 --all-repeated | sed -e 's/\(.*>\) \(.*\)/\2 \1/'
#
+Abhishek Kane <abhishek.kane@ibm.com> <abhishek.kane@gmail.com>
Abhishek Lekshmanan <abhishek.lekshmanan@cern.ch> <abhishek@suse.com>
Adam Kupczyk <akupczyk@ibm.com> <akupczyk@redhat.com> <akupczyk@mirantis.com>
Alexandre Marangone <amarango@redhat.com> Alexandre Marangone <alexandre.marangone@inktank.com>
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index b4824a65584..d25acfa9c6d 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -60,6 +60,10 @@
fuse client for `fallocate` for the default case (i.e. mode == 0) since
CephFS does not support disk space reservation. The only flags supported are
`FALLOC_FL_KEEP_SIZE` and `FALLOC_FL_PUNCH_HOLE`.
+* pybind/rados: Fixes WriteOp.zero() in the original reversed order of arguments
+ `offset` and `length`. When pybind calls WriteOp.zero(), the argument passed
+ does not match rados_write_op_zero, and offset and length are swapped, which
+ results in an unexpected response.
* The HeadBucket API now reports the `X-RGW-Bytes-Used` and `X-RGW-Object-Count`
headers only when the `read-stats` querystring is explicitly included in the
diff --git a/container/Containerfile b/container/Containerfile
index c954ebed1be..9a5a88e76a1 100644
--- a/container/Containerfile
+++ b/container/Containerfile
@@ -212,6 +212,7 @@ RUN rpm -q $(cat packages.txt) && rm -f /var/lib/rpm/__db* && rm -f *packages.tx
# Set some envs in the container for quickly inspecting details about the build at runtime
ENV CEPH_IS_DEVEL="${CI_CONTAINER}" \
CEPH_REF="${CEPH_REF}" \
+ CEPH_VERSION="${CEPH_REF}" \
CEPH_OSD_FLAVOR="${OSD_FLAVOR}" \
FROM_IMAGE="${FROM_IMAGE}"
diff --git a/debian/control b/debian/control
index a7d2dbb4c3a..a8c79f7a731 100644
--- a/debian/control
+++ b/debian/control
@@ -999,7 +999,8 @@ Depends: librados2 (= ${binary:Version}),
liblua5.3-0,
${misc:Depends},
${shlibs:Depends},
-Suggests: luarocks,
+Suggests: liblua5.3-dev,
+ luarocks,
Description: RADOS Gateway client library
RADOS is a distributed object store used by the Ceph distributed
storage system. This package provides a REST gateway to the
diff --git a/doc/cephfs/disaster-recovery-experts.rst b/doc/cephfs/disaster-recovery-experts.rst
index 7677b42f47e..b01a3dfde6a 100644
--- a/doc/cephfs/disaster-recovery-experts.rst
+++ b/doc/cephfs/disaster-recovery-experts.rst
@@ -21,43 +21,46 @@ Advanced: Metadata repair tools
Journal export
--------------
-Before attempting dangerous operations, make a copy of the journal like so:
+Before attempting any dangerous operation, make a copy of the journal by
+running the following command:
-::
+.. prompt:: bash #
- cephfs-journal-tool journal export backup.bin
+ cephfs-journal-tool journal export backup.bin
-Note that this command may not always work if the journal is badly corrupted,
-in which case a RADOS-level copy should be made (http://tracker.ceph.com/issues/9902).
+If the journal is badly corrupted, this command might not work. If the journal
+is badly corrupted, make a RADOS-level copy
+(http://tracker.ceph.com/issues/9902).
Dentry recovery from journal
----------------------------
If a journal is damaged or for any reason an MDS is incapable of replaying it,
-attempt to recover what file metadata we can like so:
+attempt to recover file metadata by running the following command:
-::
+.. prompt:: bash #
- cephfs-journal-tool event recover_dentries summary
+ cephfs-journal-tool event recover_dentries summary
-This command by default acts on MDS rank 0, pass --rank=<n> to operate on other ranks.
+By default, this command acts on MDS rank ``0``. Pass the option ``--rank=<n>``
+to the ``cephfs-journal-tool`` command to operate on other ranks.
-This command will write any inodes/dentries recoverable from the journal
-into the backing store, if these inodes/dentries are higher-versioned
-than the previous contents of the backing store. If any regions of the journal
-are missing/damaged, they will be skipped.
+This command writes all inodes and dentries recoverable from the journal into
+the backing store, but only if these inodes and dentries are higher-versioned
+than the existing contents of the backing store. Any regions of the journal
+that are missing or damaged will be skipped.
-Note that in addition to writing out dentries and inodes, this command will update
-the InoTables of each 'in' MDS rank, to indicate that any written inodes' numbers
-are now in use. In simple cases, this will result in an entirely valid backing
+In addition to writing out dentries and inodes, this command updates the
+InoTables of each ``in`` MDS rank, to indicate that any written inodes' numbers
+are now in use. In simple cases, this will result in an entirely valid backing
store state.
.. warning::
- The resulting state of the backing store is not guaranteed to be self-consistent,
- and an online MDS scrub will be required afterwards. The journal contents
- will not be modified by this command, you should truncate the journal
+ The resulting state of the backing store is not guaranteed to be
+ self-consistent, and an online MDS scrub will be required afterwards. The
+ journal contents will not be modified by this command. Truncate the journal
separately after recovering what you can.
Journal truncation
diff --git a/doc/radosgw/config-ref.rst b/doc/radosgw/config-ref.rst
index b4aa56fff54..405bc727208 100644
--- a/doc/radosgw/config-ref.rst
+++ b/doc/radosgw/config-ref.rst
@@ -75,10 +75,11 @@ aggressiveness of lifecycle processing:
.. confval:: rgw_lc_max_wp_worker
These values can be tuned based upon your specific workload to further increase the
-aggressiveness of lifecycle processing. For a workload with a larger number of buckets (thousands)
-you would look at increasing the :confval:`rgw_lc_max_worker` value from the default value of 3 whereas for a
-workload with a smaller number of buckets but higher number of objects (hundreds of thousands)
-per bucket you would consider increasing :confval:`rgw_lc_max_wp_worker` from the default value of 3.
+aggressiveness of lifecycle processing. For a workload with a large number of buckets (thousands)
+you would raise the number of workers by increasing :confval:`rgw_lc_max_worker`
+from the default value of 3. Whereas for a workload with a higher number of objects per bucket
+(hundreds of thousands) you would raise the number of parallel threads
+by increasing :confval:`rgw_lc_max_wp_worker` from the default value of 3.
.. note:: When looking to tune either of these specific values please validate the
current Cluster performance and Ceph Object Gateway utilization before increasing.
diff --git a/qa/standalone/osd/osd-bluefs-volume-ops.sh b/qa/standalone/osd/osd-bluefs-volume-ops.sh
index aedfbc9b5cb..f7424de8ce1 100755
--- a/qa/standalone/osd/osd-bluefs-volume-ops.sh
+++ b/qa/standalone/osd/osd-bluefs-volume-ops.sh
@@ -72,7 +72,7 @@ function TEST_bluestore() {
truncate $dir/0/block -s 4294967296 # 4GB
ceph-bluestore-tool --path $dir/0 bluefs-bdev-expand || return 1
- truncate $dir/1/block -s 4311744512 # 4GB + 16MB
+ truncate $dir/1/block -s 11811160064 # 11GB to get bdev label at 10737418240
ceph-bluestore-tool --path $dir/1 bluefs-bdev-expand || return 1
truncate $dir/2/block -s 4295099392 # 4GB + 129KB
ceph-bluestore-tool --path $dir/2 bluefs-bdev-expand || return 1
diff --git a/qa/suites/rados/verify/clusters/fixed-4.yaml b/qa/suites/rados/verify/clusters/fixed-4.yaml
new file mode 120000
index 00000000000..aa88300715a
--- /dev/null
+++ b/qa/suites/rados/verify/clusters/fixed-4.yaml
@@ -0,0 +1 @@
+.qa/clusters/fixed-4.yaml \ No newline at end of file
diff --git a/qa/suites/rgw/lua/tasks/0-install.yaml b/qa/suites/rgw/lua/tasks/0-install.yaml
index fa6e279145c..d85ebcc5998 100644
--- a/qa/suites/rgw/lua/tasks/0-install.yaml
+++ b/qa/suites/rgw/lua/tasks/0-install.yaml
@@ -3,7 +3,7 @@ tasks:
- ceph:
- openssl_keys:
- rgw: [client.0]
-- tox: [client.0]
+- tox: [client.0]
overrides:
ceph:
@@ -11,3 +11,11 @@ overrides:
global:
osd_min_pg_log_entries: 10
osd_max_pg_log_entries: 10
+ install:
+ ceph:
+ extra_system_packages:
+ rpm:
+ - luarocks
+ deb:
+ - liblua5.3-dev
+ - luarocks
diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index 9b04e3dc675..8f666d2fa9b 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -1206,8 +1206,18 @@ def cluster(ctx, config):
args.extend([
run.Raw('|'), 'head', '-n', '1',
])
- stdout = mon0_remote.sh(args)
- return stdout or None
+ r = mon0_remote.run(
+ stdout=BytesIO(),
+ args=args,
+ stderr=StringIO(),
+ )
+ stdout = r.stdout.getvalue().decode()
+ if stdout:
+ return stdout
+ stderr = r.stderr.getvalue()
+ if stderr:
+ return stderr
+ return None
if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
config['log_ignorelist']) is not None:
diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py
index dab61c2c700..0cde6050718 100644
--- a/qa/tasks/cephadm.py
+++ b/qa/tasks/cephadm.py
@@ -475,12 +475,16 @@ def ceph_log(ctx, config):
run.Raw('|'), 'head', '-n', '1',
])
r = ctx.ceph[cluster_name].bootstrap_remote.run(
- stdout=StringIO(),
+ stdout=BytesIO(),
args=args,
+ stderr=StringIO(),
)
- stdout = r.stdout.getvalue()
- if stdout != '':
+ stdout = r.stdout.getvalue().decode()
+ if stdout:
return stdout
+ stderr = r.stderr.getvalue()
+ if stderr:
+ return stderr
return None
# NOTE: technically the first and third arg to first_in_ceph_log
diff --git a/qa/tasks/rook.py b/qa/tasks/rook.py
index 6cb75173966..fae5ef3bf00 100644
--- a/qa/tasks/rook.py
+++ b/qa/tasks/rook.py
@@ -8,7 +8,7 @@ import json
import logging
import os
import yaml
-from io import BytesIO
+from io import BytesIO, StringIO
from tarfile import ReadError
from tasks.ceph_manager import CephManager
@@ -235,10 +235,14 @@ def ceph_log(ctx, config):
r = ctx.rook[cluster_name].remote.run(
stdout=BytesIO(),
args=args,
+ stderr=StringIO(),
)
stdout = r.stdout.getvalue().decode()
if stdout:
return stdout
+ stderr = r.stderr.getvalue()
+ if stderr:
+ return stderr
return None
if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 5874a3dce56..e66b5aa08c7 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -4167,7 +4167,7 @@ void Server::handle_client_getattr(const MDRequestRef& mdr, bool is_lookup)
if (r < 0) {
// fall-thru. let rdlock_path_pin_ref() check again.
- } else if (is_lookup) {
+ } else if (is_lookup && mdr->dn[0].size()) {
CDentry* dn = mdr->dn[0].back();
mdr->pin(dn);
auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
@@ -4274,7 +4274,7 @@ void Server::handle_client_getattr(const MDRequestRef& mdr, bool is_lookup)
// reply
dout(10) << "reply to stat on " << *req << dendl;
mdr->tracei = ref;
- if (is_lookup)
+ if (is_lookup && mdr->dn[0].size())
mdr->tracedn = mdr->dn[0].back();
respond_to_request(mdr, 0);
}
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 7332ec3edb1..833bdddc71b 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -4024,7 +4024,7 @@ void Monitor::handle_command(MonOpRequestRef op)
for (auto& p : mgrstatmon()->get_service_map().services) {
auto &service = p.first;
- if (ServiceMap::is_normal_ceph_entity(service)) {
+ if (ServiceMap::is_normal_ceph_entity(service) || service == "nvmeof") {
continue;
}
f->open_object_section(service.c_str());
diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc
index 7ed5321dcda..eb04e3b8e98 100644
--- a/src/msg/async/EventEpoll.cc
+++ b/src/msg/async/EventEpoll.cc
@@ -17,6 +17,7 @@
#include "common/errno.h"
#include <fcntl.h>
#include "EventEpoll.h"
+#include "Timeout.h"
#define dout_subsys ceph_subsys_ms
@@ -120,8 +121,7 @@ int EpollDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct ti
{
int retval, numevents = 0;
- retval = epoll_wait(epfd, events, nevent,
- tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
+ retval = epoll_wait(epfd, events, nevent, timeout_to_milliseconds(tvp));
if (retval > 0) {
numevents = retval;
fired_events.resize(numevents);
diff --git a/src/msg/async/EventPoll.cc b/src/msg/async/EventPoll.cc
index 4c09dbb4db4..f46528715e3 100644
--- a/src/msg/async/EventPoll.cc
+++ b/src/msg/async/EventPoll.cc
@@ -15,6 +15,7 @@
#include "common/errno.h"
#include "EventPoll.h"
+#include "Timeout.h"
#include <unistd.h>
#define dout_subsys ceph_subsys_ms
@@ -161,11 +162,9 @@ int PollDriver::event_wait(std::vector<FiredFileEvent> &fired_events,
struct timeval *tvp) {
int retval, numevents = 0;
#ifdef _WIN32
- retval = WSAPoll(pfds, max_pfds,
- tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
+ retval = WSAPoll(pfds, max_pfds, timeout_to_milliseconds(tvp));
#else
- retval = poll(pfds, max_pfds,
- tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
+ retval = poll(pfds, max_pfds, timeout_to_milliseconds(tvp));
#endif
if (retval > 0) {
for (int j = 0; j < max_pfds; j++) {
diff --git a/src/msg/async/Timeout.h b/src/msg/async/Timeout.h
new file mode 100644
index 00000000000..b8df1b40761
--- /dev/null
+++ b/src/msg/async/Timeout.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2024 IONOS SE
+ *
+ * Author: Max Kellermann <max.kellermann@ionos.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_TIMEOUT_H
+#define CEPH_MSG_TIMEOUT_H
+
+#include "include/intarith.h" // for div_round_up()
+
+#include <time.h> // for struct timeval
+
+/**
+ * Convert the given `struct timeval` to milliseconds.
+ *
+ * This is supposed to be used as timeout parameter to system calls
+ * such as poll() and epoll_wait().
+ */
+constexpr int
+timeout_to_milliseconds(const struct timeval &tv) noexcept
+{
+ /* round up to the next millisecond so we don't wake up too early */
+ return tv.tv_sec * 1000 + div_round_up(tv.tv_usec, 1000);
+}
+
+/**
+ * This overload makes the timeout optional; on nullptr, it returns
+ * -1.
+ */
+constexpr int
+timeout_to_milliseconds(const struct timeval *tv) noexcept
+{
+ return tv != nullptr ? timeout_to_milliseconds(*tv) : -1;
+}
+
+#endif
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 2f88acdc93b..50f293d45fd 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -3794,7 +3794,7 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
if (offset > fnode.size) {
ceph_abort_msg("truncate up not supported");
}
- ceph_assert(offset <= fnode.size);
+
_flush_bdev(h);
{
std::lock_guard ll(log.lock);
@@ -3803,44 +3803,42 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
vselector->sub_usage(h->file->vselector_hint, fnode);
uint64_t x_off = 0;
auto p = fnode.seek(offset, &x_off);
- uint64_t cut_off =
- (p == fnode.extents.end()) ? 0 : p2roundup(x_off, alloc_size[p->bdev]);
- uint64_t new_allocated;
- if (0 == cut_off) {
- // whole pextent to remove
- changed_extents = true;
- new_allocated = offset;
- } else if (cut_off < p->length) {
- dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off);
- new_allocated = (offset - x_off) + cut_off;
- p->length = cut_off;
- changed_extents = true;
- ++p;
- } else {
- ceph_assert(cut_off >= p->length);
- new_allocated = (offset - x_off) + p->length;
- // just leave it here
- ++p;
- }
- while (p != fnode.extents.end()) {
- dirty.pending_release[p->bdev].insert(p->offset, p->length);
- p = fnode.extents.erase(p);
- changed_extents = true;
+ if (p != fnode.extents.end()) {
+ uint64_t cut_off = p2roundup(x_off, alloc_size[p->bdev]);
+ if (0 == cut_off) {
+ // whole pextent to remove
+ fnode.allocated = offset;
+ changed_extents = true;
+ } else if (cut_off < p->length) {
+ dirty.pending_release[p->bdev].insert(p->offset + cut_off,
+ p->length - cut_off);
+ fnode.allocated = (offset - x_off) + cut_off;
+ p->length = cut_off;
+ changed_extents = true;
+ ++p;
+ } else {
+ // cut_off > p->length means that we misaligned the extent
+ ceph_assert(cut_off == p->length);
+ fnode.allocated = (offset - x_off) + p->length;
+ ++p; // leave extent untouched
+ }
+ while (p != fnode.extents.end()) {
+ dirty.pending_release[p->bdev].insert(p->offset, p->length);
+ p = fnode.extents.erase(p);
+ changed_extents = true;
+ }
}
if (changed_extents) {
fnode.size = offset;
- fnode.allocated = new_allocated;
fnode.reset_delta();
fnode.recalc_allocated();
log.t.op_file_update(fnode);
// sad, but is_dirty must be set to signal flushing of the log
h->file->is_dirty = true;
- } else {
- if (offset != fnode.size) {
- fnode.size = offset;
- //skipping log.t.op_file_update_inc, it will be done by flush()
- h->file->is_dirty = true;
- }
+ } else if (offset != fnode.size) {
+ fnode.size = offset;
+ // skipping log.t.op_file_update_inc, it will be done by flush()
+ h->file->is_dirty = true;
}
vselector->add_usage(h->file->vselector_hint, fnode);
}
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 25e6c4fe596..8f1d995fa8d 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -6930,8 +6930,19 @@ int BlueStore::_check_main_bdev_label()
return -EIO;
}
if (bluestore_bdev_label_require_all && r != 0) {
- derr << __func__ << " not all labels read properly" << dendl;
- return -EIO;
+ // We are about to complain that some labels failed.
+ // But in case if we expanded block device some labels will not be good.
+ uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
+ uint32_t valid_locations = 0;
+ for (uint64_t loc : bdev_label_positions) {
+ if (loc + lsize <= bdev_label.size) {
+ ++valid_locations;
+ }
+ }
+ if (valid_locations != bdev_label_valid_locations.size()) {
+ derr << __func__ << " not all labels read properly" << dendl;
+ return -EIO;
+ }
}
return 0;
}
@@ -8967,11 +8978,25 @@ int BlueStore::expand_devices(ostream& out)
_close_db_and_around();
// mount in read/write to sync expansion changes
+ if (bdev_label_multi) {
+ // We need not do fsck, because we can be broken - size is increased,
+ // but we might not have labels set.
+ cct->_conf.set_val_or_die("bluestore_fsck_on_mount", "false");
+ }
r = _mount();
ceph_assert(r == 0);
if (fm && fm->is_null_manager()) {
// we grow the allocation range, must reflect it in the allocation file
alloc->init_add_free(size0, size - size0);
+ if (bdev_label_multi) {
+ uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
+ for (uint64_t loc : bdev_label_positions) {
+ if ((loc >= size0) && (loc + lsize <= size)) {
+ bdev_label_valid_locations.push_back(loc);
+ }
+ }
+ _write_bdev_label(cct, bdev, path + "/block", bdev_label, bdev_label_valid_locations);
+ }
need_to_destage_allocation_file = true;
}
umount();
diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py
index b3fd526815e..8acec94f382 100644
--- a/src/pybind/mgr/cephadm/services/nvmeof.py
+++ b/src/pybind/mgr/cephadm/services/nvmeof.py
@@ -47,6 +47,7 @@ class NvmeofService(CephService):
# TODO: check if we can force jinja2 to generate dicts with double quotes instead of using json.dumps
transport_tcp_options = json.dumps(spec.transport_tcp_options) if spec.transport_tcp_options else None
+ iobuf_options = json.dumps(spec.iobuf_options) if spec.iobuf_options else None
name = '{}.{}'.format(utils.name_to_config_section('nvmeof'), nvmeof_gw_id)
rados_id = name[len('client.'):] if name.startswith('client.') else name
@@ -67,6 +68,7 @@ class NvmeofService(CephService):
'rpc_socket_dir': '/var/tmp/',
'rpc_socket_name': 'spdk.sock',
'transport_tcp_options': transport_tcp_options,
+ 'iobuf_options': iobuf_options,
'rados_id': rados_id
}
gw_conf = self.mgr.template.render('services/nvmeof/ceph-nvmeof.conf.j2', context)
diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
index 37f2db52732..2a9ab309568 100644
--- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
+++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2
@@ -86,6 +86,9 @@ transport_tcp_options = {{ transport_tcp_options }}
{% if spec.tgt_cmd_extra_args %}
tgt_cmd_extra_args = {{ spec.tgt_cmd_extra_args }}
{% endif %}
+{% if iobuf_options %}
+iobuf_options = {{ iobuf_options }}
+{% endif %}
[monitor]
timeout = {{ spec.monitor_timeout }}
diff --git a/src/pybind/rados/rados.pyx b/src/pybind/rados/rados.pyx
index b54ebb483c6..bcfa6777f3d 100644
--- a/src/pybind/rados/rados.pyx
+++ b/src/pybind/rados/rados.pyx
@@ -1870,7 +1870,7 @@ cdef class WriteOp(object):
uint64_t _offset = offset
with nogil:
- rados_write_op_zero(self.write_op, _length, _offset)
+ rados_write_op_zero(self.write_op, _offset, _length)
def truncate(self, offset: int):
"""
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py
index 1ac9fa49e32..6869d5b2188 100644
--- a/src/python-common/ceph/deployment/service_spec.py
+++ b/src/python-common/ceph/deployment/service_spec.py
@@ -1384,6 +1384,7 @@ class NvmeofServiceSpec(ServiceSpec):
transport_tcp_options: Optional[Dict[str, int]] =
{"in_capsule_data_size": 8192, "max_io_qpairs_per_ctrlr": 7},
tgt_cmd_extra_args: Optional[str] = None,
+ iobuf_options: Optional[Dict[str, int]] = None,
discovery_addr: Optional[str] = None,
discovery_addr_map: Optional[Dict[str, str]] = None,
discovery_port: Optional[int] = None,
@@ -1520,6 +1521,8 @@ class NvmeofServiceSpec(ServiceSpec):
self.transport_tcp_options: Optional[Dict[str, int]] = transport_tcp_options
#: ``tgt_cmd_extra_args`` extra arguments for the nvmf_tgt process
self.tgt_cmd_extra_args = tgt_cmd_extra_args
+ #: List of extra arguments for SPDK iobuf in the form opt=value
+ self.iobuf_options: Optional[Dict[str, int]] = iobuf_options
#: ``discovery_addr`` address of the discovery service
self.discovery_addr = discovery_addr
#: ``discovery_addr_map`` per node address map of the discovery service
diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc
index c0a9059a251..1302f278f59 100644
--- a/src/rgw/driver/rados/rgw_data_sync.cc
+++ b/src/rgw/driver/rados/rgw_data_sync.cc
@@ -3021,7 +3021,7 @@ public:
if (!dest_bucket_perms.verify_bucket_permission(dest_key.value_or(key), rgw::IAM::s3PutObject)) {
ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to write into bucket (bucket=" << sync_pipe.info.dest_bucket.get_key() << ")" << dendl;
- return -EPERM;
+ return set_cr_error(-EPERM);
}
}
@@ -4520,7 +4520,7 @@ public:
}
tn->set_resource_name(SSTR(bucket_str_noinstance(bs.bucket) << "/" << key));
}
- if (retcode == -ERR_PRECONDITION_FAILED || retcode == -EPERM) {
+ if (retcode == -ERR_PRECONDITION_FAILED || retcode == -EPERM || retcode == -EACCES) {
pretty_print(sc->env, "Skipping object s3://{}/{} in sync from zone {}\n",
bs.bucket.name, key, zone_name);
set_status("Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)");
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc
index 69075c506f1..a183feabe2a 100644
--- a/src/rgw/driver/rados/rgw_rados.cc
+++ b/src/rgw/driver/rados/rgw_rados.cc
@@ -8951,7 +8951,7 @@ int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
return r;
}
- auto iter = attrset.find(RGW_ATTR_OLH_VER);
+ auto iter = attrset.find(RGW_ATTR_OLH_INFO);
if (iter == attrset.end()) { /* not an olh */
return -EINVAL;
}
diff --git a/src/rgw/driver/rados/rgw_user.cc b/src/rgw/driver/rados/rgw_user.cc
index 894d8e40950..cce593c6bd5 100644
--- a/src/rgw/driver/rados/rgw_user.cc
+++ b/src/rgw/driver/rados/rgw_user.cc
@@ -189,6 +189,11 @@ static void dump_user_info(Formatter *f, RGWUserInfo &info,
}
encode_json("type", user_source_type, f);
encode_json("mfa_ids", info.mfa_ids, f);
+ encode_json("account_id", info.account_id, f);
+ encode_json("path", info.path, f);
+ encode_json("create_date", info.create_date, f);
+ encode_json("tags", info.tags, f);
+ encode_json("group_ids", info.group_ids, f);
if (stats) {
encode_json("stats", *stats, f);
}
diff --git a/src/rgw/radosgw-admin/radosgw-admin.cc b/src/rgw/radosgw-admin/radosgw-admin.cc
index 47b68d3f902..13936c87952 100644
--- a/src/rgw/radosgw-admin/radosgw-admin.cc
+++ b/src/rgw/radosgw-admin/radosgw-admin.cc
@@ -2543,8 +2543,8 @@ static void sync_status(Formatter *formatter)
struct indented {
int w; // indent width
- std::string_view header;
- indented(int w, std::string_view header = "") : w(w), header(header) {}
+ std::string header;
+ indented(int w, std::string header = "") : w(w), header(header) {}
};
std::ostream& operator<<(std::ostream& out, const indented& h) {
return out << std::setw(h.w) << h.header << std::setw(1) << ' ';
@@ -2552,10 +2552,10 @@ std::ostream& operator<<(std::ostream& out, const indented& h) {
struct bucket_source_sync_info {
const RGWZone& _source;
- std::string_view error;
+ std::string error;
std::map<int,std::string> shards_behind;
int total_shards;
- std::string_view status;
+ std::string status;
rgw_bucket bucket_source;
bucket_source_sync_info(const RGWZone& source): _source(source) {}
@@ -3075,14 +3075,12 @@ static int bucket_sync_status(rgw::sal::Driver* driver, const RGWBucketInfo& inf
}
if (pipe.source.zone.value_or(rgw_zone_id()) == z->second.id) {
bucket_source_sync_info source_sync_info(z->second);
- auto ret = bucket_source_sync_status(dpp(), static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone(), z->second,
+ bucket_source_sync_status(dpp(), static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone(), z->second,
c->second,
info, pipe,
source_sync_info);
- if (ret == 0) {
- bucket_sync_info.source_status_info.emplace_back(std::move(source_sync_info));
- }
+ bucket_sync_info.source_status_info.emplace_back(std::move(source_sync_info));
}
}
}
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 99f7db4f569..88f5f7a9c52 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -1428,6 +1428,7 @@ struct RGWBucketEnt {
size_t size;
size_t size_rounded;
ceph::real_time creation_time;
+ ceph::real_time modification_time;
uint64_t count;
/* The placement_rule is necessary to calculate per-storage-policy statics
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 7b0ca3134a3..1793c0b8065 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -8582,6 +8582,10 @@ void RGWGetBucketPolicy::execute(optional_yield y)
void RGWDeleteBucketPolicy::send_response()
{
+ if (!op_ret) {
+ /* A successful Delete Bucket Policy should return a 204 on success */
+ op_ret = STATUS_NO_CONTENT;
+ }
if (op_ret) {
set_req_state_err(s, op_ret);
}
@@ -9257,4 +9261,3 @@ void rgw_slo_entry::decode_json(JSONObj *obj)
JSONDecoder::decode_json("etag", etag, obj);
JSONDecoder::decode_json("size_bytes", size_bytes, obj);
};
-
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index b8ff3ca2fe8..88af0fc9c27 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -361,6 +361,7 @@ void RGWListBuckets_ObjStore_SWIFT::dump_bucket_entry(const RGWBucketEnt& ent)
if (need_stats) {
s->formatter->dump_int("count", ent.count);
s->formatter->dump_int("bytes", ent.size);
+ dump_time(s, "last_modified", ent.modification_time);
}
s->formatter->close_section();
diff --git a/src/rgw/services/svc_bucket_sobj.cc b/src/rgw/services/svc_bucket_sobj.cc
index ca705c5a44d..0f4cd4e847b 100644
--- a/src/rgw/services/svc_bucket_sobj.cc
+++ b/src/rgw/services/svc_bucket_sobj.cc
@@ -556,7 +556,7 @@ int RGWSI_Bucket_SObj::read_bucket_stats(const rgw_bucket& bucket,
const DoutPrefixProvider *dpp)
{
RGWBucketInfo bucket_info;
- int ret = read_bucket_info(bucket, &bucket_info, nullptr, nullptr, boost::none, y, dpp);
+ int ret = read_bucket_info(bucket, &bucket_info, &ent->modification_time, nullptr, boost::none, y, dpp);
if (ret < 0) {
return ret;
}
diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
index 60147b5397c..32173d61afe 100644
--- a/src/test/objectstore/test_bluefs.cc
+++ b/src/test/objectstore/test_bluefs.cc
@@ -1426,6 +1426,87 @@ TEST(BlueFS, test_concurrent_dir_link_and_compact_log_56210) {
}
}
+TEST(BlueFS, truncate_drops_allocations) {
+ constexpr uint64_t K = 1024;
+ constexpr uint64_t M = 1024 * K;
+ uuid_d fsid;
+ const char* DIR_NAME="dir";
+ const char* FILE_NAME="file1";
+ struct {
+ uint64_t preallocated_size;
+ uint64_t write_size;
+ uint64_t truncate_to;
+ uint64_t allocated_after_truncate;
+ uint64_t slow_size = 0;
+ uint64_t slow_alloc_size = 64*K;
+ uint64_t db_size = 128*M;
+ uint64_t db_alloc_size = 1*M;
+ } scenarios [] = {
+ // on DB(which is SLOW) : 1 => 1, 64K remains
+ { 1*M, 1, 1, 64*K },
+ // on DB(which is SLOW), alloc 4K : 1 => 1, 4K remains
+ { 1*M, 1, 1, 4*K, 0, 4*K },
+ // on DB(which is SLOW), truncation on AU boundary : 128K => 128K, 128K remains
+ { 1*M, 128*K, 128*K, 128*K },
+ // on DB(which is SLOW), no prealloc, truncation to 0 : 1666K => 0, 0 remains
+ { 0, 1666*K, 0, 0 },
+ // on DB, truncate to 123K, expect 1M occupied
+ { 1234*K, 123*K, 123*K, 1*M, 128*M, 64*K, 10*M, 1*M },
+ // on DB, truncate to 0, expect 0 occupied
+ { 1234*K, 345*K, 0, 0, 128*M, 64*K, 10*M, 1*M },
+ // on DB, truncate to AU boundary, expect exactly 1M occupied
+ { 1234*K, 1123*K, 1*M, 1*M, 128*M, 64*K, 10*M, 1*M },
+ // on DB and SLOW, truncate only data on SLOW
+ { 0, 10*M+1, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M },
+ // on DB and SLOW, preallocate and truncate only data on SLOW
+ { 6*M, 12*M, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M },
+ // on DB and SLOW, preallocate and truncate all in SLOW and some on DB
+ // note! prealloc 6M is important, one allocation for 12M will fallback to SLOW
+ // in 6M + 6M we can be sure that 6M is on DB and 6M is on SLOW
+ { 6*M, 12*M, 3*M+1, 4*M, 128*M, 64*K, 11*M, 1*M },
+ };
+ for (auto& s : scenarios) {
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_shared_alloc_size", stringify(s.slow_alloc_size).c_str());
+ conf.SetVal("bluefs_alloc_size", stringify(s.db_alloc_size).c_str());
+
+ g_ceph_context->_conf.set_val("bluefs_shared_alloc_size", stringify(s.slow_alloc_size));
+ g_ceph_context->_conf.set_val("bluefs_alloc_size", stringify(s.db_alloc_size));
+ TempBdev bdev_db{s.db_size};
+ TempBdev bdev_slow{s.slow_size};
+
+ BlueFS fs(g_ceph_context);
+ if (s.db_size != 0) {
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0));
+ }
+ if (s.slow_size != 0) {
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0));
+ }
+
+ ASSERT_EQ(0, fs.mkfs(fsid, {BlueFS::BDEV_DB, false, false}));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({BlueFS::BDEV_DB, false, false}));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.mkdir("dir"));
+ ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, false));
+ uint64_t pre = fs.get_used();
+ ASSERT_EQ(0, fs.preallocate(h->file, 0, s.preallocated_size));
+ const std::string content(s.write_size, 'x');
+ h->append(content.c_str(), content.length());
+ fs.fsync(h);
+ ASSERT_EQ(0, fs.truncate(h, s.truncate_to));
+ fs.fsync(h);
+ uint64_t post = fs.get_used();
+ fs.close_writer(h);
+ EXPECT_EQ(pre, post - s.allocated_after_truncate);
+
+ fs.umount();
+ }
+}
+
+
+
+
TEST(BlueFS, test_log_runway) {
uint64_t max_log_runway = 65536;
ConfSaver conf(g_ceph_context->_conf);
diff --git a/src/test/pybind/test_rados.py b/src/test/pybind/test_rados.py
index 25423bd8dcb..881b29c9152 100644
--- a/src/test/pybind/test_rados.py
+++ b/src/test/pybind/test_rados.py
@@ -516,6 +516,11 @@ class TestIoctx(object):
eq(self.ioctx.read('write_ops'), b'12\x00\x005')
write_op.write_full(b'12345')
+ write_op.zero(0, 2)
+ self.ioctx.operate_write_op(write_op, "write_ops")
+ eq(self.ioctx.read('write_ops'), b'\x00\x00345')
+
+ write_op.write_full(b'12345')
write_op.truncate(2)
self.ioctx.operate_write_op(write_op, "write_ops")
eq(self.ioctx.read('write_ops'), b'12')