summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author    Adam Kupczyk <akupczyk@ibm.com>  2025-01-17 09:31:32 +0100
committer GitHub <noreply@github.com>  2025-01-17 09:31:32 +0100
commit    11086d8a1ffdd294bb574f18a8ea03a327a6ba0e (patch)
tree      28d23a779a3f0d75cdae62c81c377bad2c2badd0
parent    Merge pull request #60363 from aclamk/wip-aclamk-fix-bluefs-bdev-expand (diff)
parent    os/bluestore: Add unittest for BlueFS::truncate() (diff)
download  ceph-11086d8a1ffdd294bb574f18a8ea03a327a6ba0e.tar.xz
download  ceph-11086d8a1ffdd294bb574f18a8ea03a327a6ba0e.zip
Merge pull request #60556 from aclamk/wip-aclamk-bluefs-truncate-allocations-main
os/bluestore: Make truncate() drop unused allocations - addendum
-rw-r--r--  src/os/bluestore/BlueFS.cc            60
-rw-r--r--  src/test/objectstore/test_bluefs.cc   81
2 files changed, 110 insertions(+), 31 deletions(-)
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 2f88acdc93b..50f293d45fd 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -3794,7 +3794,7 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
if (offset > fnode.size) {
ceph_abort_msg("truncate up not supported");
}
- ceph_assert(offset <= fnode.size);
+
_flush_bdev(h);
{
std::lock_guard ll(log.lock);
@@ -3803,44 +3803,42 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
vselector->sub_usage(h->file->vselector_hint, fnode);
uint64_t x_off = 0;
auto p = fnode.seek(offset, &x_off);
- uint64_t cut_off =
- (p == fnode.extents.end()) ? 0 : p2roundup(x_off, alloc_size[p->bdev]);
- uint64_t new_allocated;
- if (0 == cut_off) {
- // whole pextent to remove
- changed_extents = true;
- new_allocated = offset;
- } else if (cut_off < p->length) {
- dirty.pending_release[p->bdev].insert(p->offset + cut_off, p->length - cut_off);
- new_allocated = (offset - x_off) + cut_off;
- p->length = cut_off;
- changed_extents = true;
- ++p;
- } else {
- ceph_assert(cut_off >= p->length);
- new_allocated = (offset - x_off) + p->length;
- // just leave it here
- ++p;
- }
- while (p != fnode.extents.end()) {
- dirty.pending_release[p->bdev].insert(p->offset, p->length);
- p = fnode.extents.erase(p);
- changed_extents = true;
+ if (p != fnode.extents.end()) {
+ uint64_t cut_off = p2roundup(x_off, alloc_size[p->bdev]);
+ if (0 == cut_off) {
+ // whole pextent to remove
+ fnode.allocated = offset;
+ changed_extents = true;
+ } else if (cut_off < p->length) {
+ dirty.pending_release[p->bdev].insert(p->offset + cut_off,
+ p->length - cut_off);
+ fnode.allocated = (offset - x_off) + cut_off;
+ p->length = cut_off;
+ changed_extents = true;
+ ++p;
+ } else {
+ // cut_off > p->length means that we misaligned the extent
+ ceph_assert(cut_off == p->length);
+ fnode.allocated = (offset - x_off) + p->length;
+ ++p; // leave extent untouched
+ }
+ while (p != fnode.extents.end()) {
+ dirty.pending_release[p->bdev].insert(p->offset, p->length);
+ p = fnode.extents.erase(p);
+ changed_extents = true;
+ }
}
if (changed_extents) {
fnode.size = offset;
- fnode.allocated = new_allocated;
fnode.reset_delta();
fnode.recalc_allocated();
log.t.op_file_update(fnode);
// sad, but is_dirty must be set to signal flushing of the log
h->file->is_dirty = true;
- } else {
- if (offset != fnode.size) {
- fnode.size = offset;
- //skipping log.t.op_file_update_inc, it will be done by flush()
- h->file->is_dirty = true;
- }
+ } else if (offset != fnode.size) {
+ fnode.size = offset;
+ // skipping log.t.op_file_update_inc, it will be done by flush()
+ h->file->is_dirty = true;
}
vselector->add_usage(h->file->vselector_hint, fnode);
}
diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
index 60147b5397c..32173d61afe 100644
--- a/src/test/objectstore/test_bluefs.cc
+++ b/src/test/objectstore/test_bluefs.cc
@@ -1426,6 +1426,87 @@ TEST(BlueFS, test_concurrent_dir_link_and_compact_log_56210) {
}
}
+TEST(BlueFS, truncate_drops_allocations) {
+ constexpr uint64_t K = 1024;
+ constexpr uint64_t M = 1024 * K;
+ uuid_d fsid;
+ const char* DIR_NAME="dir";
+ const char* FILE_NAME="file1";
+ struct {
+ uint64_t preallocated_size;
+ uint64_t write_size;
+ uint64_t truncate_to;
+ uint64_t allocated_after_truncate;
+ uint64_t slow_size = 0;
+ uint64_t slow_alloc_size = 64*K;
+ uint64_t db_size = 128*M;
+ uint64_t db_alloc_size = 1*M;
+ } scenarios [] = {
+ // on DB(which is SLOW) : 1 => 1, 64K remains
+ { 1*M, 1, 1, 64*K },
+ // on DB(which is SLOW), alloc 4K : 1 => 1, 4K remains
+ { 1*M, 1, 1, 4*K, 0, 4*K },
+ // on DB(which is SLOW), truncation on AU boundary : 128K => 128K, 128K remains
+ { 1*M, 128*K, 128*K, 128*K },
+ // on DB(which is SLOW), no prealloc, truncation to 0 : 1666K => 0, 0 remains
+ { 0, 1666*K, 0, 0 },
+ // on DB, truncate to 123K, expect 1M occupied
+ { 1234*K, 123*K, 123*K, 1*M, 128*M, 64*K, 10*M, 1*M },
+ // on DB, truncate to 0, expect 0 occupied
+ { 1234*K, 345*K, 0, 0, 128*M, 64*K, 10*M, 1*M },
+ // on DB, truncate to AU boundary, expect exactly 1M occupied
+ { 1234*K, 1123*K, 1*M, 1*M, 128*M, 64*K, 10*M, 1*M },
+ // on DB and SLOW, truncate only data on SLOW
+ { 0, 10*M+1, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M },
+ // on DB and SLOW, preallocate and truncate only data on SLOW
+ { 6*M, 12*M, 10*M+1, 10*M+64*K, 128*M, 64*K, 10*M, 1*M },
+ // on DB and SLOW, preallocate and truncate all in SLOW and some on DB
+ // note! prealloc 6M is important, one allocation for 12M will fallback to SLOW
+ // in 6M + 6M we can be sure that 6M is on DB and 6M is on SLOW
+ { 6*M, 12*M, 3*M+1, 4*M, 128*M, 64*K, 11*M, 1*M },
+ };
+ for (auto& s : scenarios) {
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_shared_alloc_size", stringify(s.slow_alloc_size).c_str());
+ conf.SetVal("bluefs_alloc_size", stringify(s.db_alloc_size).c_str());
+
+ g_ceph_context->_conf.set_val("bluefs_shared_alloc_size", stringify(s.slow_alloc_size));
+ g_ceph_context->_conf.set_val("bluefs_alloc_size", stringify(s.db_alloc_size));
+ TempBdev bdev_db{s.db_size};
+ TempBdev bdev_slow{s.slow_size};
+
+ BlueFS fs(g_ceph_context);
+ if (s.db_size != 0) {
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0));
+ }
+ if (s.slow_size != 0) {
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0));
+ }
+
+ ASSERT_EQ(0, fs.mkfs(fsid, {BlueFS::BDEV_DB, false, false}));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({BlueFS::BDEV_DB, false, false}));
+ BlueFS::FileWriter *h;
+ ASSERT_EQ(0, fs.mkdir("dir"));
+ ASSERT_EQ(0, fs.open_for_write(DIR_NAME, FILE_NAME, &h, false));
+ uint64_t pre = fs.get_used();
+ ASSERT_EQ(0, fs.preallocate(h->file, 0, s.preallocated_size));
+ const std::string content(s.write_size, 'x');
+ h->append(content.c_str(), content.length());
+ fs.fsync(h);
+ ASSERT_EQ(0, fs.truncate(h, s.truncate_to));
+ fs.fsync(h);
+ uint64_t post = fs.get_used();
+ fs.close_writer(h);
+ EXPECT_EQ(pre, post - s.allocated_after_truncate);
+
+ fs.umount();
+ }
+}
+
+
+
+
TEST(BlueFS, test_log_runway) {
uint64_t max_log_runway = 65536;
ConfSaver conf(g_ceph_context->_conf);