diff options
author | Sage Weil <sage@newdream.net> | 2021-09-08 18:37:26 +0200 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2021-10-29 15:55:57 +0200 |
commit | 59763171c5cd8a89600bfcbfcef1ec6a75d5cd37 (patch) | |
tree | fcd297a13395b0691135a291ca96ea4d7225d09b | |
parent | ceph_test_objectstore: add trivial fsck test (diff) | |
download | ceph-59763171c5cd8a89600bfcbfcef1ec6a75d5cd37.tar.xz ceph-59763171c5cd8a89600bfcbfcef1ec6a75d5cd37.zip |
os/bluestore: simplify zone to clean selection
Only pick one zone to clean based on the current. Since the best victim
may change (maybe another zone gets a bunch of releases and new dead
bytes!) there is no reason (yet) to explicitly avoid the victim zone
during allocation. There is also no need to track which zones we are
cleaning on disk because we can choose to clean from any zone at any time,
and in general want to clean from the best candidate at the time, not the
one that looked the best some time in the past.
Signed-off-by: Sage Weil <sage@newdream.net>
-rw-r--r-- | src/blk/BlockDevice.h | 7 | ||||
-rw-r--r-- | src/blk/zoned/HMSMRDevice.cc | 13 | ||||
-rw-r--r-- | src/blk/zoned/HMSMRDevice.h | 2 | ||||
-rw-r--r-- | src/os/bluestore/BlueStore.cc | 45 | ||||
-rw-r--r-- | src/os/bluestore/ZonedAllocator.cc | 107 | ||||
-rw-r--r-- | src/os/bluestore/ZonedAllocator.h | 13 | ||||
-rw-r--r-- | src/os/bluestore/ZonedFreelistManager.cc | 44 | ||||
-rw-r--r-- | src/os/bluestore/ZonedFreelistManager.h | 9 |
8 files changed, 62 insertions, 178 deletions
diff --git a/src/blk/BlockDevice.h b/src/blk/BlockDevice.h index 14efcbea79f..01fa0d44512 100644 --- a/src/blk/BlockDevice.h +++ b/src/blk/BlockDevice.h @@ -205,11 +205,14 @@ public: ceph_assert(is_smr()); return conventional_region_size; } - virtual void reset_all_zones() {} - virtual void reset_zones(const std::set<uint64_t>& zones) { + virtual void reset_all_zones() { + ceph_assert(is_smr()); + } + virtual void reset_zone(uint64_t zone) { ceph_assert(is_smr()); } virtual std::vector<uint64_t> get_zones() { + ceph_assert(is_smr()); return std::vector<uint64_t>(); } diff --git a/src/blk/zoned/HMSMRDevice.cc b/src/blk/zoned/HMSMRDevice.cc index 31e197ea821..416eae4e49f 100644 --- a/src/blk/zoned/HMSMRDevice.cc +++ b/src/blk/zoned/HMSMRDevice.cc @@ -100,14 +100,13 @@ void HMSMRDevice::reset_all_zones() zbd_reset_zones(zbd_fd, conventional_region_size, 0); } -void HMSMRDevice::reset_zones(const std::set<uint64_t>& zones) +void HMSMRDevice::reset_zone(uint64_t zone) { - dout(10) << __func__ << " 0x" << std::hex << zones << std::dec << dendl; - for (auto zone_num : zones) { - if (zbd_reset_zones(zbd_fd, zone_num * zone_size, zone_size) != 0) { - derr << __func__ << " resetting zone failed for zone 0x" << std::hex - << zone_num << std::dec << dendl; - } + dout(10) << __func__ << " zone 0x" << std::hex << zone << std::dec << dendl; + if (zbd_reset_zones(zbd_fd, zone * zone_size, zone_size) != 0) { + derr << __func__ << " resetting zone failed for zone 0x" << std::hex + << zone << std::dec << dendl; + ceph_abort("zbd_reset_zones failed"); } } diff --git a/src/blk/zoned/HMSMRDevice.h b/src/blk/zoned/HMSMRDevice.h index cfa1310bef6..b6df3200c65 100644 --- a/src/blk/zoned/HMSMRDevice.h +++ b/src/blk/zoned/HMSMRDevice.h @@ -44,7 +44,7 @@ public: // smr-specific methods bool is_smr() const final { return true; } void reset_all_zones() override; - void reset_zones(const std::set<uint64_t>& zones) override; + void reset_zone(uint64_t zone) override; std::vector<uint64_t> get_zones() override; }; diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 169ecc22572..32f86bedc3d 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -5622,9 +5622,7 @@ int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments) } } - a->init_from_zone_pointers(zones, - &zoned_cleaner_lock, - &zoned_cleaner_cond); + a->init_from_zone_pointers(zones); dout(1) << __func__ << " loaded zone pointers: " << std::hex @@ -12840,26 +12838,14 @@ void BlueStore::_kv_finalize_thread() } #ifdef HAVE_LIBZBD -void BlueStore::_zoned_cleaner_start() { +void BlueStore::_zoned_cleaner_start() +{ dout(10) << __func__ << dendl; - - auto f = dynamic_cast<ZonedFreelistManager*>(fm); - ceph_assert(f); - - auto zones_to_clean = f->get_cleaning_in_progress_zones(db); - if (!zones_to_clean.empty()) { - dout(10) << __func__ << " resuming cleaning after unclean shutdown." << dendl; - for (auto zone_num : zones_to_clean) { - _zoned_clean_zone(zone_num); - } - bdev->reset_zones(zones_to_clean); - f->mark_zones_to_clean_free(zones_to_clean, db); - } - zoned_cleaner_thread.create("bstore_zcleaner"); } -void BlueStore::_zoned_cleaner_stop() { +void BlueStore::_zoned_cleaner_stop() +{ dout(10) << __func__ << dendl; { std::unique_lock l{zoned_cleaner_lock}; @@ -12877,7 +12863,8 @@ void BlueStore::_zoned_cleaner_stop() { dout(10) << __func__ << " done" << dendl; } -void BlueStore::_zoned_cleaner_thread() { +void BlueStore::_zoned_cleaner_thread() +{ dout(10) << __func__ << " start" << dendl; std::unique_lock l{zoned_cleaner_lock}; ceph_assert(!zoned_cleaner_started); @@ -12888,8 +12875,8 @@ void BlueStore::_zoned_cleaner_thread() { auto f = dynamic_cast<ZonedFreelistManager*>(fm); ceph_assert(f); while (true) { - const auto *zones_to_clean = a->get_zones_to_clean(); - if (!zones_to_clean) { + auto zone_to_clean = a->pick_zone_to_clean(); + if (zone_to_clean < 0) { if (zoned_cleaner_stop) { break; } @@ -12898,13 +12885,10 @@ void BlueStore::_zoned_cleaner_thread() { dout(20) << __func__ << " wake" << dendl; } else { l.unlock(); - f->mark_zones_to_clean_in_progress(*zones_to_clean, db); - for (auto zone_num : *zones_to_clean) { - _zoned_clean_zone(zone_num); - } - bdev->reset_zones(*zones_to_clean); - f->mark_zones_to_clean_free(*zones_to_clean, db); - a->mark_zones_to_clean_free(); + _zoned_clean_zone(zone_to_clean); + bdev->reset_zone(zone_to_clean); + f->mark_zone_to_clean_free(zone_to_clean, db); + //a->mark_zone_to_clean_free(); l.lock(); } } @@ -12912,7 +12896,8 @@ void BlueStore::_zoned_cleaner_thread() { zoned_cleaner_started = false; } -void BlueStore::_zoned_clean_zone(uint64_t zone_num) { +void BlueStore::_zoned_clean_zone(uint64_t zone_num) +{ dout(10) << __func__ << " cleaning zone " << zone_num << dendl; // TODO: (1) copy live objects from zone_num to a new zone, (2) issue a RESET // ZONE operation to the device for the corresponding zone. diff --git a/src/os/bluestore/ZonedAllocator.cc b/src/os/bluestore/ZonedAllocator.cc index 03326d2e5fc..6933dd972d8 100644 --- a/src/os/bluestore/ZonedAllocator.cc +++ b/src/os/bluestore/ZonedAllocator.cc @@ -34,8 +34,7 @@ ZonedAllocator::ZonedAllocator(CephContext* cct, zone_size(_zone_size), first_seq_zone_num(_first_sequential_zone), starting_zone_num(first_seq_zone_num), - num_zones(size / zone_size), - num_zones_to_clean(0) + num_zones(size / zone_size) { ldout(cct, 10) << " size 0x" << std::hex << size << " zone size 0x" << zone_size << std::dec @@ -68,14 +67,7 @@ int64_t ZonedAllocator::allocate( << std::hex << want_size << std::dec << dendl; uint64_t zone_num = starting_zone_num; - auto p = zones_to_clean.lower_bound(zone_num); for ( ; zone_num < num_zones; ++zone_num) { - if (p != zones_to_clean.cend() && *p == zone_num) { - ldout(cct, 10) << " skipping zone 0x" << std::hex << zone_num - << " because it is being cleaned" << std::dec << dendl; - ++p; - continue; - } if (fits(want_size, zone_num)) { break; } @@ -110,8 +102,6 @@ int64_t ZonedAllocator::allocate( << " and zone offset 0x" << (offset % zone_size) << std::dec << dendl; - find_zones_to_clean(); - extents->emplace_back(bluestore_pextent_t(offset, want_size)); return want_size; } @@ -151,15 +141,11 @@ void ZonedAllocator::dump(std::function<void(uint64_t offset, } void ZonedAllocator::init_from_zone_pointers( - std::vector<zone_state_t> _zone_states, - ceph::mutex *_cleaner_lock, - ceph::condition_variable *_cleaner_cond) + std::vector<zone_state_t> _zone_states) { // this is called once, based on the device's zone pointers std::lock_guard l(lock); ldout(cct, 10) << dendl; - cleaner_lock = _cleaner_lock; - cleaner_cond = _cleaner_cond; zone_states = std::move(_zone_states); num_free = 0; for (size_t i = first_seq_zone_num; i < num_zones; ++i) { @@ -172,16 +158,34 @@ void ZonedAllocator::init_from_zone_pointers( << dendl; } -const std::set<uint64_t> *ZonedAllocator::get_zones_to_clean(void) +int64_t ZonedAllocator::pick_zone_to_clean(void) { - ldout(cct, 10) << dendl; - return num_zones_to_clean ? &zones_to_clean : nullptr; + int32_t best = -1; + int64_t best_score = 0; + for (size_t i = first_seq_zone_num; i < num_zones; ++i) { + int64_t score = zone_states[i].num_dead_bytes; + // discount by remaining space so we will tend to clean full zones + score -= (zone_size - zone_states[i].write_pointer) / 2; + if (score > 0 && (best < 0 || score > best_score)) { + best = i; + best_score = score; + } + } + if (best >= 0) { + ldout(cct, 10) << " zone 0x" << std::hex << best << " with score 0x" << best_score + << ": 0x" << zone_states[best].num_dead_bytes + << " dead and 0x" + << zone_states[best].write_pointer - zone_states[best].num_dead_bytes + << " live bytes" << std::dec << dendl; + } else { + ldout(cct, 10) << " no zones found that are good cleaning candidates" << dendl; + } + return best; } bool ZonedAllocator::low_on_space(void) { - ceph_assert(zones_to_clean.empty()); - + std::lock_guard l(lock); uint64_t sequential_num_free = num_free - conventional_size; double free_ratio = static_cast<double>(sequential_num_free) / sequential_size; @@ -194,67 +198,6 @@ bool ZonedAllocator::low_on_space(void) return free_ratio <= 0.25; } -void ZonedAllocator::find_zones_to_clean(void) -{ - ldout(cct, 40) << dendl; - - if (num_zones_to_clean || !low_on_space()) - return; - - ceph_assert(zones_to_clean.empty()); - - // TODO: make this tunable; handle the case when there aren't this many zones - // to clean. - const int64_t num_zones_to_clean_at_once = 1; - - std::vector<uint64_t> idx(num_zones); - std::iota(idx.begin(), idx.end(), 0); - - if (cct->_conf->subsys.should_gather<ceph_subsys_bluestore, 40>()) { - for (size_t i = 0; i < zone_states.size(); ++i) { - dout(40) << " zone 0x" << std::hex << i << std::dec << " " - << zone_states[i] << dendl; - } - } - - std::partial_sort(idx.begin(), idx.begin() + num_zones_to_clean_at_once, idx.end(), - [this](uint64_t i1, uint64_t i2) { - return zone_states[i1].num_dead_bytes > zone_states[i2].num_dead_bytes; - }); - - ldout(cct, 10) << " the zone that needs cleaning is 0x" - << std::hex << *idx.begin() << " num_dead_bytes = 0x" - << zone_states[*idx.begin()].num_dead_bytes - << std::dec - << dendl; - - zones_to_clean = {idx.begin(), idx.begin() + num_zones_to_clean_at_once}; - num_zones_to_clean = num_zones_to_clean_at_once; - - // TODO: handle the case of disk being full. - ceph_assert(!zones_to_clean.empty()); - ceph_assert(num_zones_to_clean != 0); - - cleaner_lock->lock(); - cleaner_cond->notify_one(); - cleaner_lock->unlock(); -} - -void ZonedAllocator::mark_zones_to_clean_free(void) -{ - std::lock_guard l(lock); - ldout(cct, 10) << dendl; - for (auto zone_num : zones_to_clean) { - ldout(cct, 10) << " zone 0x" << std::hex << zone_num - << " is now clean" << std::dec << dendl; - num_free += zone_states[zone_num].write_pointer; - zone_states[zone_num].num_dead_bytes = 0; - zone_states[zone_num].write_pointer = 0; - } - zones_to_clean.clear(); - num_zones_to_clean = 0; -} - void ZonedAllocator::shutdown() { ldout(cct, 1) << dendl; diff --git a/src/os/bluestore/ZonedAllocator.h b/src/os/bluestore/ZonedAllocator.h index 872ee29325a..4dccb9f3806 100644 --- a/src/os/bluestore/ZonedAllocator.h +++ b/src/os/bluestore/ZonedAllocator.h @@ -39,11 +39,6 @@ class ZonedAllocator : public Allocator { uint64_t starting_zone_num; uint64_t num_zones; std::vector<zone_state_t> zone_states; - std::set<uint64_t> zones_to_clean; - std::atomic<int64_t> num_zones_to_clean; - - ceph::mutex *cleaner_lock = nullptr; - ceph::condition_variable *cleaner_cond = nullptr; inline uint64_t get_offset(uint64_t zone_num) const { return zone_num * zone_size + get_write_pointer(zone_num); @@ -96,13 +91,10 @@ public: void dump(std::function<void(uint64_t offset, uint64_t length)> notify) override; - const std::set<uint64_t> *get_zones_to_clean(void); - void mark_zones_to_clean_free(void); + int64_t pick_zone_to_clean(void); void init_from_zone_pointers( - std::vector<zone_state_t> _zone_states, - ceph::mutex *_cleaner_lock, - ceph::condition_variable *_cleaner_cond); + std::vector<zone_state_t> _zone_states); void init_add_free(uint64_t offset, uint64_t length) override {} void init_rm_free(uint64_t offset, uint64_t length) override {} @@ -110,7 +102,6 @@ public: private: bool low_on_space(void); - void find_zones_to_clean(void); }; #endif diff --git a/src/os/bluestore/ZonedFreelistManager.cc b/src/os/bluestore/ZonedFreelistManager.cc index 06fecb60d26..60e72c2cd27 100644 --- a/src/os/bluestore/ZonedFreelistManager.cc +++ b/src/os/bluestore/ZonedFreelistManager.cc @@ -331,46 +331,14 @@ int ZonedFreelistManager::_read_cfg(cfg_reader_t cfg_reader) return 0; } -std::set<uint64_t> ZonedFreelistManager::get_cleaning_in_progress_zones( - KeyValueDB *kvdb) const -{ - bufferlist bl; - std::set<uint64_t> zones_to_clean; - if (kvdb->get(meta_prefix, CLEANING_IN_PROGRESS_KEY, &bl) == 0) { - decode(zones_to_clean, bl); - } - return zones_to_clean; -} - -void ZonedFreelistManager::mark_zones_to_clean_free( - const std::set<uint64_t>& zones_to_clean, KeyValueDB *kvdb) -{ - dout(10) << __func__ << dendl; - - KeyValueDB::Transaction txn = kvdb->get_transaction(); - for (auto zone_num : zones_to_clean) { - ldout(cct, 10) << __func__ << " zone " << zone_num << " is now clean in DB" << dendl; - - zone_state_t zone_state; - write_zone_state_to_db(zone_num, zone_state, txn); - } - txn->rmkey(meta_prefix, CLEANING_IN_PROGRESS_KEY); - kvdb->submit_transaction_sync(txn); -} - -// Marks the zones currently being cleaned in the db. Should be called before -// starting the cleaning. If we crash mid-cleaning, the recovery code will check -// if there is a key CLEANING_IN_PROGRESS_KEY in the meta_prefix namespace, and -// if so, will read the zones and resume cleaning. -void ZonedFreelistManager::mark_zones_to_clean_in_progress( - const std::set<uint64_t>& zones_to_clean, KeyValueDB *kvdb) +void ZonedFreelistManager::mark_zone_to_clean_free( + uint64_t zone, + KeyValueDB *kvdb) { - dout(10) << __func__ << dendl; + dout(10) << __func__ << " zone 0x" << std::hex << zone << std::dec << dendl; - bufferlist bl; - encode(zones_to_clean, bl); - KeyValueDB::Transaction txn = kvdb->get_transaction(); - txn->set(meta_prefix, CLEANING_IN_PROGRESS_KEY, bl); + zone_state_t zone_state; + write_zone_state_to_db(zone, zone_state, txn); kvdb->submit_transaction_sync(txn); } diff --git a/src/os/bluestore/ZonedFreelistManager.h b/src/os/bluestore/ZonedFreelistManager.h index 7ad07233698..e60a40222eb 100644 --- a/src/os/bluestore/ZonedFreelistManager.h +++ b/src/os/bluestore/ZonedFreelistManager.h @@ -22,8 +22,6 @@ using cfg_reader_t = std::function<int(const std::string&, std::string*)>; -const std::string CLEANING_IN_PROGRESS_KEY = "cleaning_in_progress"; - class ZonedFreelistManager : public FreelistManager { std::string meta_prefix; ///< device size, zone size, etc. std::string info_prefix; ///< per zone write pointer, dead bytes @@ -104,11 +102,8 @@ public: std::vector<std::pair<std::string, std::string>>*) const override; std::vector<zone_state_t> get_zone_states(KeyValueDB *kvdb) const; - std::set<uint64_t> get_cleaning_in_progress_zones(KeyValueDB *kvdb) const; - void mark_zones_to_clean_free(const std::set<uint64_t>& zones_to_clean, - KeyValueDB *kvdb); - void mark_zones_to_clean_in_progress(const std::set<uint64_t>& zones_to_clean, - KeyValueDB *kvdb); + + void mark_zone_to_clean_free(uint64_t zone, KeyValueDB *kvdb); }; #endif |