From 7f3601089d41bfc23f530c7bf3fb7efad2d055ec Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Fri, 10 Jan 2025 08:26:54 +0000 Subject: os/bluestore: Fix BlueFS::truncate() In `struct bluefs_fnode_t` there is a vector `extents` and the vector `extents_index` that is a log2 seek cache. Until modifications to truncate() we never removed extents from files. Modified truncate() did not update extents_index. For example 10 extents long files when truncated to 0 will have: 0 extents, 10 extents_index. After writing some data to file: 1 extents, 11 extents_index. Now, `bluefs_fnode_t::seek` will binary search extents_index, lets say it located seek at item #3. It will then jump up from #0 extent (that exists) to #3 extent which does not exist at. The worst part is that code is now broken, as #3 != extent.end(). There are 3 parts of the fix: 1) assert in `bluefs_fnode_t::seek` to protect against jumping outside extents 2) code in BlueFS::truncate to sync up `extents_index` with `extents` 3) dampening down assert in _replay to give a way out of cases where incorrect "offset 12345" (12345 is file size) instead of "offset 20000" (allocations occupied) was written to log. Fixes: https://tracker.ceph.com/issues/69481 Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueFS.cc | 4 +++- src/os/bluestore/bluefs_types.cc | 4 +++- src/os/bluestore/bluefs_types.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index e123a0a200a..2f88acdc93b 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1706,7 +1706,8 @@ int BlueFS::_replay(bool noop, bool to_stdout) << " fnode=" << fnode << " delta=" << delta << dendl; - ceph_assert(delta.offset == fnode.allocated); + // be leanient, if there is no extents just produce error message + ceph_assert(delta.offset == fnode.allocated || delta.extents.empty()); } if (cct->_conf->bluefs_log_replay_check_allocations) { int r = _check_allocations(fnode, @@ -3830,6 +3831,7 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/ fnode.size = offset; fnode.allocated = new_allocated; fnode.reset_delta(); + fnode.recalc_allocated(); log.t.op_file_update(fnode); // sad, but is_dirty must be set to signal flushing of the log h->file->is_dirty = true; diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc index e18dd490140..fe77f7f74d8 100644 --- a/src/os/bluestore/bluefs_types.cc +++ b/src/os/bluestore/bluefs_types.cc @@ -154,7 +154,9 @@ mempool::bluefs::vector::iterator bluefs_fnode_t::seek( assert(it != extents_index.begin()); --it; assert(offset >= *it); - p += it - extents_index.begin(); + uint32_t skip = it - extents_index.begin(); + ceph_assert(skip <= extents.size()); + p += skip; offset -= *it; } diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h index 627118c12f8..08b3ca0cf41 100644 --- a/src/os/bluestore/bluefs_types.h +++ b/src/os/bluestore/bluefs_types.h @@ -89,6 +89,7 @@ struct bluefs_fnode_t { void recalc_allocated() { allocated = 0; extents_index.reserve(extents.size()); + extents_index.clear(); for (auto& p : extents) { extents_index.emplace_back(allocated); allocated += p.length; -- cgit v1.2.3