summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Sage Weil <sage@newdream.net> 2021-09-07 20:53:06 +0200
committer Sage Weil <sage@newdream.net> 2021-10-29 15:55:57 +0200
commit 7f74551b7bdf13afb0f0d31b18bdafff91df3d82 (patch)
tree de4ee683eaf04b2faa736d0fd6af1d91fe4bdfad
parent blk/zoned: add get_zones() to fetch write pointers (diff)
download ceph-7f74551b7bdf13afb0f0d31b18bdafff91df3d82.tar.xz
ceph-7f74551b7bdf13afb0f0d31b18bdafff91df3d82.zip
os/bluestore: fix startup vs device write pointers
Compare freelist write pointers to the device on startup.

Signed-off-by: Sage Weil <sage@newdream.net>
-rw-r--r-- src/os/bluestore/BlueStore.cc 31
-rw-r--r-- src/os/bluestore/ZonedAllocator.cc 2
-rw-r--r-- src/os/bluestore/ZonedAllocator.h 2
-rw-r--r-- src/os/bluestore/zoned_types.h 4
4 files changed, 34 insertions, 5 deletions
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 2004f728645..459f7070b1b 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -5592,7 +5592,36 @@ int BlueStore::_init_alloc()
ceph_assert(a);
auto f = dynamic_cast<ZonedFreelistManager*>(fm);
ceph_assert(f);
- a->init_from_zone_pointers(f->get_zone_states(db),
+ vector<uint64_t> wp = bdev->get_zones();
+ vector<zone_state_t> zones = f->get_zone_states(db);
+ ceph_assert(wp.size() == zones.size());
+
+ // reconcile zone state
+ auto num_zones = bdev->get_size() / zone_size;
+ for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
+ ceph_assert(wp[i] >= i * zone_size);
+ ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
+ uint64_t p = wp[i] - i * zone_size;
+ if (zones[i].write_pointer > p) {
+ derr << __func__ << " zone 0x" << std::hex << i
+ << " bluestore write pointer 0x" << zones[i].write_pointer
+ << " > device write pointer 0x" << p
+ << std::dec << dendl;
+ ceph_abort("bad write pointer");
+ } else if (zones[i].write_pointer < p) {
+ // this is "normal" in that it can happen after any crash (if we have a
+ // write in flight but did not manage to commit the transaction)
+ auto delta = p - zones[i].write_pointer;
+ dout(1) << __func__ << " zone 0x" << std::hex << i
+ << " device write pointer 0x" << p
+ << " > bluestore pointer 0x" << zones[i].write_pointer
+ << ", advancing 0x" << delta << std::dec << dendl;
+ zones[i].num_dead_bytes += delta;
+ zones[i].write_pointer = p;
+ }
+ }
+
+ a->init_from_zone_pointers(zones,
&zoned_cleaner_lock,
&zoned_cleaner_cond);
dout(1) << __func__
diff --git a/src/os/bluestore/ZonedAllocator.cc b/src/os/bluestore/ZonedAllocator.cc
index 0ec8000e17b..03326d2e5fc 100644
--- a/src/os/bluestore/ZonedAllocator.cc
+++ b/src/os/bluestore/ZonedAllocator.cc
@@ -151,7 +151,7 @@ void ZonedAllocator::dump(std::function<void(uint64_t offset,
}
void ZonedAllocator::init_from_zone_pointers(
- std::vector<zone_state_t> &&_zone_states,
+ std::vector<zone_state_t> _zone_states,
ceph::mutex *_cleaner_lock,
ceph::condition_variable *_cleaner_cond)
{
diff --git a/src/os/bluestore/ZonedAllocator.h b/src/os/bluestore/ZonedAllocator.h
index 73697e4cfd5..903b62fd1a0 100644
--- a/src/os/bluestore/ZonedAllocator.h
+++ b/src/os/bluestore/ZonedAllocator.h
@@ -96,7 +96,7 @@ public:
void mark_zones_to_clean_free(void);
void init_from_zone_pointers(
- std::vector<zone_state_t> &&_zone_states,
+ std::vector<zone_state_t> _zone_states,
ceph::mutex *_cleaner_lock,
ceph::condition_variable *_cleaner_cond);
void init_add_free(uint64_t offset, uint64_t length) override {}
diff --git a/src/os/bluestore/zoned_types.h b/src/os/bluestore/zoned_types.h
index c92f2ada099..93d3138399a 100644
--- a/src/os/bluestore/zoned_types.h
+++ b/src/os/bluestore/zoned_types.h
@@ -14,8 +14,8 @@
// We use the same struct for an on-disk and in-memory representation of the
// state.
struct zone_state_t {
- uint64_t num_dead_bytes = 0;
- uint64_t write_pointer = 0;
+ uint64_t num_dead_bytes = 0; ///< dead bytes deallocated (behind the write pointer)
+ uint64_t write_pointer = 0; ///< relative offset within the zone
void encode(ceph::buffer::list &bl) const {
using ceph::encode;