Diffstat (limited to 'src')
23 files changed, 229 insertions, 57 deletions
diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 5874a3dce56..e66b5aa08c7 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4167,7 +4167,7 @@ void Server::handle_client_getattr(const MDRequestRef& mdr, bool is_lookup) if (r < 0) { // fall-thru. let rdlock_path_pin_ref() check again. - } else if (is_lookup) { + } else if (is_lookup && mdr->dn[0].size()) { CDentry* dn = mdr->dn[0].back(); mdr->pin(dn); auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple()); @@ -4274,7 +4274,7 @@ void Server::handle_client_getattr(const MDRequestRef& mdr, bool is_lookup) // reply dout(10) << "reply to stat on " << *req << dendl; mdr->tracei = ref; - if (is_lookup) + if (is_lookup && mdr->dn[0].size()) mdr->tracedn = mdr->dn[0].back(); respond_to_request(mdr, 0); } diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 7332ec3edb1..833bdddc71b 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -4024,7 +4024,7 @@ void Monitor::handle_command(MonOpRequestRef op) for (auto& p : mgrstatmon()->get_service_map().services) { auto &service = p.first; - if (ServiceMap::is_normal_ceph_entity(service)) { + if (ServiceMap::is_normal_ceph_entity(service) || service == "nvmeof") { continue; } f->open_object_section(service.c_str()); diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc index 7ed5321dcda..eb04e3b8e98 100644 --- a/src/msg/async/EventEpoll.cc +++ b/src/msg/async/EventEpoll.cc @@ -17,6 +17,7 @@ #include "common/errno.h" #include <fcntl.h> #include "EventEpoll.h" +#include "Timeout.h" #define dout_subsys ceph_subsys_ms @@ -120,8 +121,7 @@ int EpollDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct ti { int retval, numevents = 0; - retval = epoll_wait(epfd, events, nevent, - tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + retval = epoll_wait(epfd, events, nevent, timeout_to_milliseconds(tvp)); if (retval > 0) { numevents = retval; fired_events.resize(numevents); diff --git a/src/msg/async/EventPoll.cc b/src/msg/async/EventPoll.cc index 4c09dbb4db4..f46528715e3 100644 --- a/src/msg/async/EventPoll.cc +++ b/src/msg/async/EventPoll.cc @@ -15,6 +15,7 @@ #include "common/errno.h" #include "EventPoll.h" +#include "Timeout.h" #include <unistd.h> #define dout_subsys ceph_subsys_ms @@ -161,11 +162,9 @@ int PollDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct timeval *tvp) { int retval, numevents = 0; #ifdef _WIN32 - retval = WSAPoll(pfds, max_pfds, - tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + retval = WSAPoll(pfds, max_pfds, timeout_to_milliseconds(tvp)); #else - retval = poll(pfds, max_pfds, - tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + retval = poll(pfds, max_pfds, timeout_to_milliseconds(tvp)); #endif if (retval > 0) { for (int j = 0; j < max_pfds; j++) { diff --git a/src/msg/async/Timeout.h b/src/msg/async/Timeout.h new file mode 100644 index 00000000000..b8df1b40761 --- /dev/null +++ b/src/msg/async/Timeout.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2024 IONOS SE + * + * Author: Max Kellermann <max.kellermann@ionos.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MSG_TIMEOUT_H +#define CEPH_MSG_TIMEOUT_H + +#include "include/intarith.h" // for div_round_up() + +#include <time.h> // for struct timeval + +/** + * Convert the given `struct timeval` to milliseconds. + * + * This is supposed to be used as timeout parameter to system calls + * such as poll() and epoll_wait(). + */ +constexpr int +timeout_to_milliseconds(const struct timeval &tv) noexcept +{ + /* round up to the next millisecond so we don't wake up too early */ + return tv.tv_sec * 1000 + div_round_up(tv.tv_usec, 1000); +} + +/** + * This overload makes the timeout optional; on nullptr, it returns + * -1. + */ +constexpr int +timeout_to_milliseconds(const struct timeval *tv) noexcept +{ + return tv != nullptr ? timeout_to_milliseconds(*tv) : -1; +} + +#endif diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 2f88acdc93b..50f293d45fd 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -3794,7 +3794,7 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ if (offset > fnode.size) { ceph_abort_msg("truncate up not supported"); } - ceph_assert(offset <= fnode.size); + _flush_bdev(h); { std::lock_guard ll(log.lock); @@ -3803,44 +3803,42 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ vselector->sub_usage(h->file->vselector_hint, fnode); uint64_t x_off = 0; auto p = fnode.seek(offset, &x_off); - uint64_t cut_off = - (p == fnode.extents.end()) ? 0 : p2roundup(x_off, alloc_size[p->bdev]); - uint64_t new_allocated; - if (0 == cut_off) { - // whole pextent to remove - changed_extents = true; - new_allocated = offset; - } else if (cut_off < p->length) { - dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off); - new_allocated = (offset - x_off) + cut_off; - p->length = cut_off; - changed_extents = true; - ++p; - } else { - ceph_assert(cut_off >= p->length); - new_allocated = (offset - x_off) + p->length; - // just leave it here - ++p; - } - while (p != fnode.extents.end()) { - dirty.pending_release[p->bdev].insert(p->offset, p->length); - p = fnode.extents.erase(p); - changed_extents = true; + if (p != fnode.extents.end()) { + uint64_t cut_off = p2roundup(x_off, alloc_size[p->bdev]); + if (0 == cut_off) { + // whole pextent to remove + fnode.allocated = offset; + changed_extents = true; + } else if (cut_off < p->length) { + dirty.pending_release[p->bdev].insert(p->offset + cut_off, + p->length - cut_off); + fnode.allocated = (offset - x_off) + cut_off; + p->length = cut_off; + changed_extents = true; + ++p; + } else { + // cut_off > p->length means that we misaligned the extent + ceph_assert(cut_off == p->length); + fnode.allocated = (offset - x_off) + p->length; + ++p; // leave extent untouched + } + while (p != fnode.extents.end()) { + dirty.pending_release[p->bdev].insert(p->offset, p->length); + p = fnode.extents.erase(p); + changed_extents = true; + } } if (changed_extents) { fnode.size = offset; - fnode.allocated = new_allocated; fnode.reset_delta(); fnode.recalc_allocated(); log.t.op_file_update(fnode); // sad, but is_dirty must be set to signal flushing of the log h->file->is_dirty = true; - } else { - if (offset != fnode.size) { - fnode.size = offset; - //skipping log.t.op_file_update_inc, it will be done by flush() - h->file->is_dirty = true; - } + } else if (offset != fnode.size) { + fnode.size = offset; + // skipping log.t.op_file_update_inc, it will be done by flush() + h->file->is_dirty = true; } vselector->add_usage(h->file->vselector_hint, fnode); } diff --git 
a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 25e6c4fe596..8f1d995fa8d 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6930,8 +6930,19 @@ int BlueStore::_check_main_bdev_label() return -EIO; } if (bluestore_bdev_label_require_all && r != 0) { - derr << __func__ << " not all labels read properly" << dendl; - return -EIO; + // We are about to complain that some labels failed. + // But in case if we expanded block device some labels will not be good. + uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); + uint32_t valid_locations = 0; + for (uint64_t loc : bdev_label_positions) { + if (loc + lsize <= bdev_label.size) { + ++valid_locations; + } + } + if (valid_locations != bdev_label_valid_locations.size()) { + derr << __func__ << " not all labels read properly" << dendl; + return -EIO; + } } return 0; } @@ -8967,11 +8978,25 @@ int BlueStore::expand_devices(ostream& out) _close_db_and_around(); // mount in read/write to sync expansion changes + if (bdev_label_multi) { + // We need not do fsck, because we can be broken - size is increased, + // but we might not have labels set. + cct->_conf.set_val_or_die("bluestore_fsck_on_mount", "false"); + } r = _mount(); ceph_assert(r == 0); if (fm && fm->is_null_manager()) { // we grow the allocation range, must reflect it in the allocation file alloc->init_add_free(size0, size - size0); + if (bdev_label_multi) { + uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size); + for (uint64_t loc : bdev_label_positions) { + if ((loc >= size0) && (loc + lsize <= size)) { + bdev_label_valid_locations.push_back(loc); + } + } + _write_bdev_label(cct, bdev, path + "/block", bdev_label, bdev_label_valid_locations); + } need_to_destage_allocation_file = true; } umount(); diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index b3fd526815e..8acec94f382 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -47,6 +47,7 @@ class NvmeofService(CephService): # TODO: check if we can force jinja2 to generate dicts with double quotes instead of using json.dumps transport_tcp_options = json.dumps(spec.transport_tcp_options) if spec.transport_tcp_options else None + iobuf_options = json.dumps(spec.iobuf_options) if spec.iobuf_options else None name = '{}.{}'.format(utils.name_to_config_section('nvmeof'), nvmeof_gw_id) rados_id = name[len('client.'):] if name.startswith('client.') else name @@ -67,6 +68,7 @@ class NvmeofService(CephService): 'rpc_socket_dir': '/var/tmp/', 'rpc_socket_name': 'spdk.sock', 'transport_tcp_options': transport_tcp_options, + 'iobuf_options': iobuf_options, 'rados_id': rados_id } gw_conf = self.mgr.template.render('services/nvmeof/ceph-nvmeof.conf.j2', context) diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 index 37f2db52732..2a9ab309568 100644 --- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 @@ -86,6 +86,9 @@ transport_tcp_options = {{ transport_tcp_options }} {% if spec.tgt_cmd_extra_args %} tgt_cmd_extra_args = {{ spec.tgt_cmd_extra_args }} {% endif %} +{% if iobuf_options %} +iobuf_options = {{ iobuf_options }} +{% endif %} [monitor] timeout = {{ spec.monitor_timeout }} diff --git a/src/pybind/rados/rados.pyx b/src/pybind/rados/rados.pyx index b54ebb483c6..bcfa6777f3d 
100644 --- a/src/pybind/rados/rados.pyx +++ b/src/pybind/rados/rados.pyx @@ -1870,7 +1870,7 @@ cdef class WriteOp(object): uint64_t _offset = offset with nogil: - rados_write_op_zero(self.write_op, _length, _offset) + rados_write_op_zero(self.write_op, _offset, _length) def truncate(self, offset: int): """ diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py index 1ac9fa49e32..6869d5b2188 100644 --- a/src/python-common/ceph/deployment/service_spec.py +++ b/src/python-common/ceph/deployment/service_spec.py @@ -1384,6 +1384,7 @@ class NvmeofServiceSpec(ServiceSpec): transport_tcp_options: Optional[Dict[str, int]] = {"in_capsule_data_size": 8192, "max_io_qpairs_per_ctrlr": 7}, tgt_cmd_extra_args: Optional[str] = None, + iobuf_options: Optional[Dict[str, int]] = None, discovery_addr: Optional[str] = None, discovery_addr_map: Optional[Dict[str, str]] = None, discovery_port: Optional[int] = None, @@ -1520,6 +1521,8 @@ class NvmeofServiceSpec(ServiceSpec): self.transport_tcp_options: Optional[Dict[str, int]] = transport_tcp_options #: ``tgt_cmd_extra_args`` extra arguments for the nvmf_tgt process self.tgt_cmd_extra_args = tgt_cmd_extra_args + #: List of extra arguments for SPDK iobuf in the form opt=value + self.iobuf_options: Optional[Dict[str, int]] = iobuf_options #: ``discovery_addr`` address of the discovery service self.discovery_addr = discovery_addr #: ``discovery_addr_map`` per node address map of the discovery service diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc index c0a9059a251..1302f278f59 100644 --- a/src/rgw/driver/rados/rgw_data_sync.cc +++ b/src/rgw/driver/rados/rgw_data_sync.cc @@ -3021,7 +3021,7 @@ public: if (!dest_bucket_perms.verify_bucket_permission(dest_key.value_or(key), rgw::IAM::s3PutObject)) { ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to write into bucket (bucket=" << sync_pipe.info.dest_bucket.get_key() << ")" << dendl; - return -EPERM; + return set_cr_error(-EPERM); } } @@ -4520,7 +4520,7 @@ public: } tn->set_resource_name(SSTR(bucket_str_noinstance(bs.bucket) << "/" << key)); } - if (retcode == -ERR_PRECONDITION_FAILED || retcode == -EPERM) { + if (retcode == -ERR_PRECONDITION_FAILED || retcode == -EPERM || retcode == -EACCES) { pretty_print(sc->env, "Skipping object s3://{}/{} in sync from zone {}\n", bs.bucket.name, key, zone_name); set_status("Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)"); diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc index 69075c506f1..a183feabe2a 100644 --- a/src/rgw/driver/rados/rgw_rados.cc +++ b/src/rgw/driver/rados/rgw_rados.cc @@ -8951,7 +8951,7 @@ int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, return r; } - auto iter = attrset.find(RGW_ATTR_OLH_VER); + auto iter = attrset.find(RGW_ATTR_OLH_INFO); if (iter == attrset.end()) { /* not an olh */ return -EINVAL; } diff --git a/src/rgw/driver/rados/rgw_rest_log.cc b/src/rgw/driver/rados/rgw_rest_log.cc index 9315dfc0afd..72216a471b3 100644 --- a/src/rgw/driver/rados/rgw_rest_log.cc +++ b/src/rgw/driver/rados/rgw_rest_log.cc @@ -1061,7 +1061,7 @@ void RGWOp_BILog_Status::execute(optional_yield y) if (!pipe.dest.bucket) { /* Uh oh, something went wrong */ - ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl; + ldpp_dout(this, 0) << 
"ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl; op_ret = -EIO; return; } diff --git a/src/rgw/driver/rados/rgw_user.cc b/src/rgw/driver/rados/rgw_user.cc index 894d8e40950..cce593c6bd5 100644 --- a/src/rgw/driver/rados/rgw_user.cc +++ b/src/rgw/driver/rados/rgw_user.cc @@ -189,6 +189,11 @@ static void dump_user_info(Formatter *f, RGWUserInfo &info, } encode_json("type", user_source_type, f); encode_json("mfa_ids", info.mfa_ids, f); + encode_json("account_id", info.account_id, f); + encode_json("path", info.path, f); + encode_json("create_date", info.create_date, f); + encode_json("tags", info.tags, f); + encode_json("group_ids", info.group_ids, f); if (stats) { encode_json("stats", *stats, f); } diff --git a/src/rgw/radosgw-admin/radosgw-admin.cc b/src/rgw/radosgw-admin/radosgw-admin.cc index 47b68d3f902..13936c87952 100644 --- a/src/rgw/radosgw-admin/radosgw-admin.cc +++ b/src/rgw/radosgw-admin/radosgw-admin.cc @@ -2543,8 +2543,8 @@ static void sync_status(Formatter *formatter) struct indented { int w; // indent width - std::string_view header; - indented(int w, std::string_view header = "") : w(w), header(header) {} + std::string header; + indented(int w, std::string header = "") : w(w), header(header) {} }; std::ostream& operator<<(std::ostream& out, const indented& h) { return out << std::setw(h.w) << h.header << std::setw(1) << ' '; @@ -2552,10 +2552,10 @@ std::ostream& operator<<(std::ostream& out, const indented& h) { struct bucket_source_sync_info { const RGWZone& _source; - std::string_view error; + std::string error; std::map<int,std::string> shards_behind; int total_shards; - std::string_view status; + std::string status; rgw_bucket bucket_source; bucket_source_sync_info(const RGWZone& source): _source(source) {} @@ -3075,14 +3075,12 @@ static int bucket_sync_status(rgw::sal::Driver* driver, const RGWBucketInfo& inf } if (pipe.source.zone.value_or(rgw_zone_id()) == z->second.id) { bucket_source_sync_info source_sync_info(z->second); - auto ret = bucket_source_sync_status(dpp(), static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone(), z->second, + bucket_source_sync_status(dpp(), static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone(), z->second, c->second, info, pipe, source_sync_info); - if (ret == 0) { - bucket_sync_info.source_status_info.emplace_back(std::move(source_sync_info)); - } + bucket_sync_info.source_status_info.emplace_back(std::move(source_sync_info)); } } } diff --git a/src/rgw/radosgw-admin/sync_checkpoint.cc b/src/rgw/radosgw-admin/sync_checkpoint.cc index 0303ed6c747..97da99bdc27 100644 --- a/src/rgw/radosgw-admin/sync_checkpoint.cc +++ b/src/rgw/radosgw-admin/sync_checkpoint.cc @@ -228,6 +228,7 @@ int rgw_bucket_sync_checkpoint(const DoutPrefixProvider* dpp, } auto& entry = sources.emplace_back(); entry.pipe = pipe; + entry.pipe.dest.bucket = info.bucket; // so it contains the bucket key (+bucket id) // fetch remote markers boost::asio::spawn(ioctx, [&] (boost::asio::yield_context yield) { diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 99f7db4f569..88f5f7a9c52 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -1428,6 +1428,7 @@ struct RGWBucketEnt { size_t size; size_t size_rounded; ceph::real_time creation_time; + ceph::real_time modification_time; uint64_t count; /* The placement_rule is necessary to calculate per-storage-policy statics diff --git 
a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 9f25d275852..d6f846b0d2f 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -8587,6 +8587,10 @@ void RGWGetBucketPolicy::execute(optional_yield y) void RGWDeleteBucketPolicy::send_response() { + if (!op_ret) { + /* A successful Delete Bucket Policy should return a 204 on success */ + op_ret = STATUS_NO_CONTENT; + } if (op_ret) { set_req_state_err(s, op_ret); } @@ -9262,4 +9266,3 @@ void rgw_slo_entry::decode_json(JSONObj *obj) JSONDecoder::decode_json("etag", etag, obj); JSONDecoder::decode_json("size_bytes", size_bytes, obj); }; - diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc index b8ff3ca2fe8..88af0fc9c27 100644 --- a/src/rgw/rgw_rest_swift.cc +++ b/src/rgw/rgw_rest_swift.cc @@ -361,6 +361,7 @@ void RGWListBuckets_ObjStore_SWIFT::dump_bucket_entry(const RGWBucketEnt& ent) if (need_stats) { s->formatter->dump_int("count", ent.count); s->formatter->dump_int("bytes", ent.size); + dump_time(s, "last_modified", ent.modification_time); } s->formatter->close_section(); diff --git a/src/rgw/services/svc_bucket_sobj.cc b/src/rgw/services/svc_bucket_sobj.cc index ca705c5a44d..0f4cd4e847b 100644 --- a/src/rgw/services/svc_bucket_sobj.cc +++ b/src/rgw/services/svc_bucket_sobj.cc @@ -556,7 +556,7 @@ int RGWSI_Bucket_SObj::read_bucket_stats(const rgw_bucket& bucket, const DoutPrefixProvider *dpp) { RGWBucketInfo bucket_info; - int ret = read_bucket_info(bucket, &bucket_info, nullptr, nullptr, boost::none, y, dpp); + int ret = read_bucket_info(bucket, &bucket_info, &ent->modification_time, nullptr, boost::none, y, dpp); if (ret < 0) { return ret; } diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index 60147b5397c..32173d61afe 100644 --- a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -1426,6 +1426,87 @@ TEST(BlueFS, test_concurrent_dir_link_and_compact_log_56210) { } } +TEST(BlueFS, truncate_drops_allocations) { + constexpr uint64_t K = 1024; + constexpr uint64_t M = 1024 * K; + uuid_d fsid; + const char* DIR_NAME="dir"; + const char* FILE_NAME="file1"; + struct { + uint64_t preallocated_size; + uint64_t write_size; + uint64_t truncate_to; + uint64_t allocated_after_truncate; + uint64_t slow_size = 0; + uint64_t slow_alloc_size = 64*K; + uint64_t db_size = 128*M; + uint64_t db_alloc_size = 1*M; + } scenarios [] = { + // on DB(which is SLOW) : 1 => 1, 64K remains + { 1*M, 1, 1, 64*K }, + // on DB(which is SLOW), alloc 4K : 1 => 1, 4K remains + { 1*M, 1, 1, 4*K, 0, 4*K }, + // on DB(which is SLOW), truncation on AU boundary : 128K => 128K, 128K remains + { 1*M, 128*K, 128*K, 128*K }, + // on DB(which is SLOW), no prealloc, truncation to 0 : 1666K => 0, 0 remains + { 0, 1666*K, 0, 0 }, + // on DB, truncate to 123K, expect 1M occupied + { 1234*K, 123*K, 123*K, 1*M, 128*M, 64*K, 10*M, 1*M }, + // on DB, truncate to 0, expect 0 occupied + { 1234*K, 345*K, 0, 0, 128*M, 64*K, 10*M, 1*M }, + // on DB, truncate to AU boundary, expect exactly 1M occupied + { 1234*K, 1123*K, 1*M, 1*M, 128*M, 64*K, 10*M, 1*M }, + // on DB and SLOW, truncate only data on SLOW + { 0, 10*M+1, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M }, + // on DB and SLOW, preallocate and truncate only data on SLOW + { 6*M, 12*M, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M }, + // on DB and SLOW, preallocate and truncate all in SLOW and some on DB + // note! 
prealloc 6M is important, one allocation for 12M will fallback to SLOW + // in 6M + 6M we can be sure that 6M is on DB and 6M is on SLOW + { 6*M, 12*M, 3*M+1, 4*M, 128*M, 64*K, 11*M, 1*M }, + }; + for (auto& s : scenarios) { + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_shared_alloc_size", stringify(s.slow_alloc_size).c_str()); + conf.SetVal("bluefs_alloc_size", stringify(s.db_alloc_size).c_str()); + + g_ceph_context->_conf.set_val("bluefs_shared_alloc_size", stringify(s.slow_alloc_size)); + g_ceph_context->_conf.set_val("bluefs_alloc_size", stringify(s.db_alloc_size)); + TempBdev bdev_db{s.db_size}; + TempBdev bdev_slow{s.slow_size}; + + BlueFS fs(g_ceph_context); + if (s.db_size != 0) { + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0)); + } + if (s.slow_size != 0) { + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0)); + } + + ASSERT_EQ(0, fs.mkfs(fsid, {BlueFS::BDEV_DB, false, false})); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({BlueFS::BDEV_DB, false, false})); + BlueFS::FileWriter *h; + ASSERT_EQ(0, fs.mkdir("dir")); + ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, false)); + uint64_t pre = fs.get_used(); + ASSERT_EQ(0, fs.preallocate(h->file, 0, s.preallocated_size)); + const std::string content(s.write_size, 'x'); + h->append(content.c_str(), content.length()); + fs.fsync(h); + ASSERT_EQ(0, fs.truncate(h, s.truncate_to)); + fs.fsync(h); + uint64_t post = fs.get_used(); + fs.close_writer(h); + EXPECT_EQ(pre, post - s.allocated_after_truncate); + + fs.umount(); + } +} + + + + TEST(BlueFS, test_log_runway) { uint64_t max_log_runway = 65536; ConfSaver conf(g_ceph_context->_conf); diff --git a/src/test/pybind/test_rados.py b/src/test/pybind/test_rados.py index 25423bd8dcb..881b29c9152 100644 --- a/src/test/pybind/test_rados.py +++ b/src/test/pybind/test_rados.py @@ -516,6 +516,11 @@ class TestIoctx(object): eq(self.ioctx.read('write_ops'), b'12\x00\x005') write_op.write_full(b'12345') + write_op.zero(0, 2) + self.ioctx.operate_write_op(write_op, "write_ops") + eq(self.ioctx.read('write_ops'), b'\x00\x00345') + + write_op.write_full(b'12345') write_op.truncate(2) self.ioctx.operate_write_op(write_op, "write_ops") eq(self.ioctx.read('write_ops'), b'12') |
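
Note on src/mds/Server.cc: the getattr path now checks mdr->dn[0].size() before taking back(), so a lookup whose dentry vector is empty no longer dereferences the last element of an empty container. A minimal illustration of the guarded pattern on a plain std::vector (the types here are stand-ins, not the MDS classes):

#include <cassert>
#include <vector>

int main() {
  std::vector<int> dn;              // stands in for mdr->dn[0]
  // calling dn.back() here would be undefined behaviour, so guard first
  if (!dn.empty()) {
    int &last = dn.back();          // only reached when a dentry exists
    (void)last;
  }
  assert(dn.empty());
}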
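
Note on src/msg/async/Timeout.h: the new header centralizes the timeval-to-milliseconds conversion shared by epoll_wait(), poll() and WSAPoll(), and rounds the microsecond part up instead of truncating it, so a sub-millisecond timeout no longer collapses to a 0 ms (non-blocking) wait. A standalone sketch of the same arithmetic, with div_round_up() written out inline for illustration:

#include <sys/time.h>

constexpr int timeout_to_ms(const struct timeval &tv) noexcept {
  // round up to the next millisecond so the caller never wakes up too early
  return tv.tv_sec * 1000 + (tv.tv_usec + 999) / 1000;
}

constexpr int timeout_to_ms(const struct timeval *tv) noexcept {
  return tv != nullptr ? timeout_to_ms(*tv) : -1;   // nullptr means "wait forever"
}

static_assert(timeout_to_ms(timeval{0, 1}) == 1);         // 1 us -> 1 ms, not 0
static_assert(timeout_to_ms(timeval{2, 500000}) == 2500);
static_assert(timeout_to_ms(nullptr) == -1);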
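
Note on src/os/bluestore/BlueFS.cc and the new BlueFS.truncate_drops_allocations test: truncate() now walks the extent list from the truncate point, trims the extent containing the new end of file to its allocation-unit boundary and releases every extent after it, instead of only shrinking fnode.size. The boundary arithmetic is the usual power-of-two round-up; a small sketch with assumed sizes (64 KiB allocation unit, as in several of the test scenarios):

#include <cassert>
#include <cstdint>

// power-of-two round-up, as p2roundup() does in the real code
constexpr uint64_t p2roundup(uint64_t x, uint64_t align) {
  return (x + align - 1) & ~(align - 1);
}

int main() {
  const uint64_t alloc_size = 64 * 1024;     // assumed bdev allocation unit
  const uint64_t x_off = 123 * 1024;         // truncate point inside the last kept extent
  const uint64_t cut_off = p2roundup(x_off, alloc_size);
  assert(cut_off == 128 * 1024);             // the extent keeps one full AU...
  // ...and everything past cut_off, plus all later extents, goes into
  // dirty.pending_release and is erased from fnode.extents.
}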
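
Note on src/os/bluestore/BlueStore.cc: after a device expansion the fixed label positions beyond the old end of the device do not hold a label yet, so _check_main_bdev_label() now only fails when the number of labels actually read differs from the number of label positions that fit within the recorded size, and expand_devices() writes labels into the newly covered positions. The fit check is the same simple bound test in both places; a sketch with illustrative values only (not the real bdev_label_positions):

#include <cassert>
#include <cstdint>
#include <vector>

// how many label copies fit entirely within a device of the given size
uint32_t labels_that_fit(const std::vector<uint64_t> &positions,
                         uint64_t label_size, uint64_t device_size) {
  uint32_t n = 0;
  for (uint64_t loc : positions)
    if (loc + label_size <= device_size)   // the whole label must fit
      ++n;
  return n;
}

int main() {
  const std::vector<uint64_t> positions = {0, 1ull << 30, 10ull << 30, 100ull << 30};
  const uint64_t label_size = 4096;
  assert(labels_that_fit(positions, label_size, 20ull << 30) == 3);
  assert(labels_that_fit(positions, label_size, 200ull << 30) == 4);
}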
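
Note on src/pybind/rados/rados.pyx and src/test/pybind/test_rados.py: WriteOp.zero() was passing (length, offset) to rados_write_op_zero(), whose corrected call order per this fix is (offset, length), so the two values were swapped whenever they differed. The new test pins the corrected behaviour; the same sequence expressed against the librados C API (error handling omitted, ioctx and oid assumed to already exist):

#include <rados/librados.h>
#include <stddef.h>

void zero_prefix(rados_ioctx_t ioctx, const char *oid) {
  rados_write_op_t op = rados_create_write_op();
  rados_write_op_write_full(op, "12345", 5);
  rados_write_op_zero(op, 0, 2);                  /* offset first, then length */
  rados_write_op_operate(op, ioctx, oid, NULL, 0);
  rados_release_write_op(op);
  /* the object now reads "\0\0345", matching the new test_rados.py check */
}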
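
Note on src/rgw/radosgw-admin/radosgw-admin.cc: the `indented` and `bucket_source_sync_info` members change from std::string_view to std::string, presumably so the sync-status structs can safely keep messages built from temporaries. A string_view only references storage owned elsewhere, so storing one in an object that outlives the expression it was built from (for example an SSTR(...) temporary) leaves a dangling view, while an owning std::string copies the bytes. Illustrative only, not the radosgw-admin code paths:

#include <string>
#include <string_view>

struct view_holder   { std::string_view status; };   // refers to someone else's bytes
struct string_holder { std::string      status; };   // owns its bytes

void example() {
  view_holder v;
  v.status = std::string("behind on ") + "3 shards";  // temporary destroyed here: v.status dangles
  string_holder s;
  s.status = std::string("behind on ") + "3 shards";  // safe: copied into the member
  (void)v; (void)s;
}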