Diffstat (limited to 'src/crimson')
125 files changed, 4652 insertions, 2829 deletions
diff --git a/src/crimson/admin/osd_admin.cc b/src/crimson/admin/osd_admin.cc index de9626a2f2d..41da72c9fde 100644 --- a/src/crimson/admin/osd_admin.cc +++ b/src/crimson/admin/osd_admin.cc @@ -14,6 +14,7 @@ #include "common/config.h" #include "crimson/admin/admin_socket.h" #include "crimson/common/log.h" +#include "crimson/common/perf_counters_collection.h" #include "crimson/osd/exceptions.h" #include "crimson/osd/osd.h" #include "crimson/osd/pg.h" diff --git a/src/crimson/common/fixed_kv_node_layout.h b/src/crimson/common/fixed_kv_node_layout.h index 2a91ac39540..db62a2df32d 100644 --- a/src/crimson/common/fixed_kv_node_layout.h +++ b/src/crimson/common/fixed_kv_node_layout.h @@ -360,11 +360,16 @@ public: } - FixedKVNodeLayout(char *buf) : - buf(buf) {} + FixedKVNodeLayout() : buf(nullptr) {} virtual ~FixedKVNodeLayout() = default; + void set_layout_buf(char *_buf) { + assert(buf == nullptr); + assert(_buf != nullptr); + buf = _buf; + } + const_iterator begin() const { return const_iterator( this, diff --git a/src/crimson/common/logclient.cc b/src/crimson/common/logclient.cc index d402ecd1901..a3c30227bc7 100644 --- a/src/crimson/common/logclient.cc +++ b/src/crimson/common/logclient.cc @@ -7,6 +7,7 @@ #include "crimson/net/Messenger.h" #include "crimson/mon/MonClient.h" #include "mon/MonMap.h" +#include "common/Clock.h" // for ceph_clock_now() #include "common/Graylog.h" using std::map; diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h index 92d99d332c4..0d73658e709 100644 --- a/src/crimson/common/shared_lru.h +++ b/src/crimson/common/shared_lru.h @@ -25,12 +25,17 @@ class SharedLRU { SimpleLRU<K, shared_ptr_t, false> cache; std::map<K, std::pair<weak_ptr_t, V*>> weak_refs; + // Once all of the shared pointers are destroyed, + // erase the tracked object from the weak_ref map + // before actually destroying it struct Deleter { - SharedLRU<K,V>* cache; + SharedLRU<K,V>* shared_lru_ptr; const K key; - void operator()(V* ptr) { - cache->_erase_weak(key); - delete ptr; + void operator()(V* value_ptr) { + if (shared_lru_ptr) { + shared_lru_ptr->_erase_weak(key); + } + delete value_ptr; } }; void _erase_weak(const K& key) { @@ -42,9 +47,19 @@ public: {} ~SharedLRU() { cache.clear(); + // initially, we were assuming that no pointer obtained from SharedLRU // can outlive the lru itself. However, since going with the interruption // concept for handling shutdowns, this is no longer valid. + // Moreover, before clearing weak_refs, invalidate each deleter // cache pointer as this SharedLRU is being destroyed.
+ for (const auto& [key, value] : weak_refs) { + shared_ptr_t val; + val = value.first.lock(); + auto this_deleter = get_deleter<Deleter>(val); + this_deleter->shared_lru_ptr = nullptr; + } + weak_refs.clear(); } /** diff --git a/src/crimson/common/tmap_helpers.cc b/src/crimson/common/tmap_helpers.cc index 9c14ebc450e..58c4fc7e218 100644 --- a/src/crimson/common/tmap_helpers.cc +++ b/src/crimson/common/tmap_helpers.cc @@ -7,6 +7,8 @@ #include "include/encoding.h" #include "include/rados.h" +#include <map> + namespace detail { #define decode_or_return(v, bp) \ diff --git a/src/crimson/mon/MonClient.cc b/src/crimson/mon/MonClient.cc index 4919f0bf21f..4c076cf43c6 100644 --- a/src/crimson/mon/MonClient.cc +++ b/src/crimson/mon/MonClient.cc @@ -13,6 +13,7 @@ #include "auth/AuthClientHandler.h" #include "auth/RotatingKeyRing.h" +#include "common/Clock.h" // for ceph_clock_now() #include "common/hostname.h" #include "include/utime_fmt.h" diff --git a/src/crimson/net/Socket.cc b/src/crimson/net/Socket.cc index 2c729f4e8c2..3a7aeaf9651 100644 --- a/src/crimson/net/Socket.cc +++ b/src/crimson/net/Socket.cc @@ -8,6 +8,7 @@ #include <seastar/net/packet.hh> #include "crimson/common/log.h" +#include "include/random.h" // for ceph::util::generate_random_number() #include "Errors.h" using crimson::common::local_conf; diff --git a/src/crimson/net/io_handler.cc b/src/crimson/net/io_handler.cc index b93124f3c12..bc5e9bf404c 100644 --- a/src/crimson/net/io_handler.cc +++ b/src/crimson/net/io_handler.cc @@ -347,7 +347,7 @@ void IOHandler::do_set_io_state( { ceph_assert_always(seastar::this_shard_id() == get_shard_id()); auto prv_state = get_io_state(); - logger().debug("{} got {}do_set_io_state(): prv_state={}, new_state={}, " + logger().debug("{} got {} do_set_io_state(): prv_state={}, new_state={}, " "fa={}, set_notify_out={}, at {}", conn, cc_seq.has_value() ? 
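The shared_lru.h change above closes a shutdown race: every shared_ptr handed out carries a Deleter holding a back-pointer into the LRU, and the destructor now walks weak_refs and nulls those back-pointers so a pointer released after the SharedLRU is gone no longer calls _erase_weak() on freed memory. A minimal self-contained sketch of the pattern, assuming std::get_deleter in place of crimson's get_deleter helper (MiniSharedLRU is a toy stand-in, not the crimson class):

```cpp
#include <cassert>
#include <map>
#include <memory>

template <class K, class V>
class MiniSharedLRU {
  struct Deleter {
    MiniSharedLRU* shared_lru_ptr;  // nulled if the LRU dies first
    K key;
    void operator()(V* value_ptr) {
      if (shared_lru_ptr) {
        shared_lru_ptr->weak_refs.erase(key);
      }
      delete value_ptr;
    }
  };
  std::map<K, std::weak_ptr<V>> weak_refs;

public:
  std::shared_ptr<V> add(const K& k, V* v) {
    std::shared_ptr<V> p(v, Deleter{this, k});
    weak_refs[k] = p;
    return p;
  }
  ~MiniSharedLRU() {
    for (auto& [k, w] : weak_refs) {
      if (auto p = w.lock()) {
        // std::get_deleter yields the stored deleter when the type matches
        if (auto* d = std::get_deleter<Deleter>(p)) {
          d->shared_lru_ptr = nullptr;  // late releases now skip the map
        }
      }
    }
    weak_refs.clear();
  }
};
```

Locking each weak_ptr first is what guarantees the deleter object still exists while its back-pointer is being cleared.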
fmt::format("{} ", *cc_seq) : "", @@ -984,7 +984,7 @@ void IOHandler::notify_out_dispatch() }); }); } - if (shard_states->try_enter_out_dispatching()) { + if (shard_states->try_enter_out_dispatching(conn)) { shard_states->dispatch_in_background( "do_out_dispatch", conn, [this] { return do_out_dispatch(*shard_states); diff --git a/src/crimson/net/io_handler.h b/src/crimson/net/io_handler.h index 5986fcb16ac..41c76ab925b 100644 --- a/src/crimson/net/io_handler.h +++ b/src/crimson/net/io_handler.h @@ -309,7 +309,7 @@ public: in_exit_dispatching = std::nullopt; } - bool try_enter_out_dispatching() { + bool try_enter_out_dispatching(SocketConnection &conn) { assert(seastar::this_shard_id() == sid); if (out_dispatching) { // already dispatching out @@ -327,6 +327,9 @@ public: // do not dispatch out return false; default: + crimson::get_logger(ceph_subsys_ms).error( + "{} try_enter_out_dispatching() got wrong io_state {}", + conn, io_state); ceph_abort("impossible"); } } @@ -574,6 +577,8 @@ struct fmt::formatter<crimson::net::IOHandler::io_state_t> case switched: name = "switched"; break; + default: + name = "undefined"; } return formatter<string_view>::format(name, ctx); } diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc index 3fd2bb1fd15..db6decd84f9 100644 --- a/src/crimson/os/alienstore/alien_store.cc +++ b/src/crimson/os/alienstore/alien_store.cc @@ -141,7 +141,8 @@ seastar::future<> AlienStore::stop() AlienStore::base_errorator::future<bool> AlienStore::exists( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { return op_gates.simple_dispatch("exists", [=, this] { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] { @@ -212,7 +213,8 @@ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> AlienStore::list_objects(CollectionRef ch, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const + uint64_t limit, + uint32_t op_flags) const { logger().debug("{}", __func__); assert(tp); @@ -348,7 +350,8 @@ AlienStore::readv(CollectionRef ch, AlienStore::get_attr_errorator::future<ceph::bufferlist> AlienStore::get_attr(CollectionRef ch, const ghobject_t& oid, - std::string_view name) const + std::string_view name, + uint32_t op_flags) const { logger().debug("{}", __func__); assert(tp); @@ -376,7 +379,8 @@ AlienStore::get_attr(CollectionRef ch, AlienStore::get_attrs_ertr::future<AlienStore::attrs_t> AlienStore::get_attrs(CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { logger().debug("{}", __func__); assert(tp); @@ -397,7 +401,8 @@ AlienStore::get_attrs(CollectionRef ch, auto AlienStore::omap_get_values(CollectionRef ch, const ghobject_t& oid, - const set<string>& keys) + const set<string>& keys, + uint32_t op_flags) -> read_errorator::future<omap_values_t> { logger().debug("{}", __func__); @@ -421,7 +426,8 @@ auto AlienStore::omap_get_values(CollectionRef ch, auto AlienStore::omap_get_values(CollectionRef ch, const ghobject_t &oid, - const std::optional<string> &start) + const std::optional<string> &start, + uint32_t op_flags) -> read_errorator::future<std::tuple<bool, omap_values_t>> { logger().debug("{} with_start", __func__); @@ -429,8 +435,21 @@ auto AlienStore::omap_get_values(CollectionRef ch, return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] { auto c = static_cast<AlienCollection*>(ch.get()); - return 
store->omap_get_values(c->collection, oid, start, - reinterpret_cast<map<string, bufferlist>*>(&values)); + return store->omap_iterate( + c->collection, oid, + ObjectStore::omap_iter_seek_t{ + .seek_position = start.value_or(std::string{}), + // FIXME: classical OSDs begin iteration from LOWER_BOUND + // (or UPPER_BOUND if filter_prefix > start). However, these + // bits are not implemented yet + .seek_type = ObjectStore::omap_iter_seek_t::UPPER_BOUND + }, + [&values] + (std::string_view key, std::string_view value) mutable { + values[std::string{key}].append(value); + // FIXME: the limit on the number of entries is not implemented yet + return ObjectStore::omap_iter_ret_t::NEXT; + }); }).then([&values] (int r) -> read_errorator::future<std::tuple<bool, omap_values_t>> { if (r == -ENOENT) { @@ -578,7 +597,8 @@ unsigned AlienStore::get_max_attr_name_length() const seastar::future<struct stat> AlienStore::stat( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { assert(tp); return do_with_op_gate((struct stat){}, [this, ch, oid](auto& st) { @@ -590,8 +610,22 @@ seastar::future<struct stat> AlienStore::stat( }); } +seastar::future<std::string> AlienStore::get_default_device_class() +{ + logger().debug("{}", __func__); + assert(tp); + return op_gates.simple_dispatch("get_default_device_class", [=, this] { + return tp->submit([=, this] { + return store->get_default_device_class(); + }).then([] (std::string device_class) { + return seastar::make_ready_future<std::string>(device_class); + }); + }); +} + auto AlienStore::omap_get_header(CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) -> get_attr_errorator::future<ceph::bufferlist> { assert(tp); @@ -617,7 +651,8 @@ AlienStore::read_errorator::future<std::map<uint64_t, uint64_t>> AlienStore::fie CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { assert(tp); return do_with_op_gate(std::map<uint64_t, uint64_t>(), [=, this](auto& destmap) { diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h index d36f449afd8..1d39411450e 100644 --- a/src/crimson/os/alienstore/alien_store.h +++ b/src/crimson/os/alienstore/alien_store.h @@ -36,7 +36,8 @@ public: base_errorator::future<bool> exists( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final; read_errorator::future<ceph::bufferlist> read(CollectionRef c, const ghobject_t& oid, @@ -49,29 +50,36 @@ public: uint32_t op_flags = 0) final; - get_attr_errorator::future<ceph::bufferlist> get_attr(CollectionRef c, - const ghobject_t& oid, - std::string_view name) const final; - get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c, - const ghobject_t& oid) final; + get_attr_errorator::future<ceph::bufferlist> get_attr( + CollectionRef c, + const ghobject_t& oid, + std::string_view name, + uint32_t op_flags = 0) const final; + get_attrs_ertr::future<attrs_t> get_attrs( + CollectionRef c, + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) final; + const omap_keys_t& keys, + uint32_t op_flags = 0) final; /// Retrieves paged set of values > start (if present) read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const
std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) final; ///< @return <done, values> values.empty() iff done seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const final; + uint64_t limit, + uint32_t op_flags = 0) const final; seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; seastar::future<CollectionRef> open_collection(const coll_t& cid) final; @@ -97,15 +105,19 @@ public: unsigned get_max_attr_name_length() const final; seastar::future<struct stat> stat( CollectionRef, - const ghobject_t&) final; + const ghobject_t&, + uint32_t op_flags = 0) final; + seastar::future<std::string> get_default_device_class() final; get_attr_errorator::future<ceph::bufferlist> omap_get_header( CollectionRef, - const ghobject_t&) final; + const ghobject_t&, + uint32_t) final; read_errorator::future<std::map<uint64_t, uint64_t>> fiemap( CollectionRef, const ghobject_t&, uint64_t off, - uint64_t len) final; + uint64_t len, + uint32_t op_flags) final; FuturizedStore::Shard& get_sharded_store() final { return *this; diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc index 5cf9590e61e..2d208548b32 100644 --- a/src/crimson/os/alienstore/thread_pool.cc +++ b/src/crimson/os/alienstore/thread_pool.cc @@ -7,6 +7,7 @@ #include <pthread.h> #include "include/ceph_assert.h" +#include "include/intarith.h" // for round_up_to() #include "crimson/common/config_proxy.h" using crimson::common::local_conf; @@ -27,7 +28,7 @@ ThreadPool::ThreadPool(size_t n_threads, pin(*cpus); } block_sighup(); - (void) pthread_setname_np(pthread_self(), "alien-store-tp"); + (void) ceph_pthread_setname("alien-store-tp"); loop(queue_max_wait, i); }); } diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc index 7b945e5aa15..41819fb5eb6 100644 --- a/src/crimson/os/cyanstore/cyan_store.cc +++ b/src/crimson/os/cyanstore/cyan_store.cc @@ -12,6 +12,7 @@ #include "crimson/common/buffer_io.h" #include "crimson/common/config_proxy.h" +#include "crimson/common/perf_counters_collection.h" #include "cyan_collection.h" #include "cyan_object.h" @@ -143,6 +144,12 @@ CyanStore::list_collections() }); } +seastar::future<std::string> +CyanStore::get_default_device_class() +{ + return seastar::make_ready_future<std::string>(""); +} + CyanStore::mount_ertr::future<> CyanStore::Shard::mount() { static const char read_file_errmsg[]{"read_file"}; @@ -201,7 +208,8 @@ CyanStore::Shard::list_objects( CollectionRef ch, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const + uint64_t limit, + uint32_t op_flags) const { auto c = static_cast<Collection*>(ch.get()); logger().debug("{} {} {} {} {}", @@ -250,7 +258,8 @@ CyanStore::Shard::list_collections() CyanStore::Shard::base_errorator::future<bool> CyanStore::Shard::exists( CollectionRef ch, - const ghobject_t &oid) + const ghobject_t &oid, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); if (!c->exists) { @@ -326,7 +335,8 @@ CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist> CyanStore::Shard::get_attr( CollectionRef ch, const ghobject_t& oid, - std::string_view name) const + std::string_view name, + uint32_t op_flags) const { auto c = static_cast<Collection*>(ch.get()); logger().debug("{} {} {}", @@ -345,7 +355,8 @@ 
CyanStore::Shard::get_attr( CyanStore::Shard::get_attrs_ertr::future<CyanStore::Shard::attrs_t> CyanStore::Shard::get_attrs( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); logger().debug("{} {} {}", @@ -360,7 +371,8 @@ CyanStore::Shard::get_attrs( auto CyanStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t& oid, - const omap_keys_t& keys) + const omap_keys_t& keys, + uint32_t op_flags) -> read_errorator::future<omap_values_t> { auto c = static_cast<Collection*>(ch.get()); @@ -381,7 +393,8 @@ auto CyanStore::Shard::omap_get_values( auto CyanStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const std::optional<string> &start) + const std::optional<string> &start, + uint32_t op_flags) -> CyanStore::Shard::read_errorator::future<std::tuple<bool, omap_values_t>> { auto c = static_cast<Collection*>(ch.get()); @@ -402,7 +415,8 @@ auto CyanStore::Shard::omap_get_values( auto CyanStore::Shard::omap_get_header( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) -> CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist> { auto c = static_cast<Collection*>(ch.get()); @@ -970,7 +984,8 @@ CyanStore::Shard::fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); @@ -985,7 +1000,8 @@ CyanStore::Shard::fiemap( seastar::future<struct stat> CyanStore::Shard::stat( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { auto c = static_cast<Collection*>(ch.get()); auto o = c->get_object(oid); diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h index 99583d07d36..1d481ef5829 100644 --- a/src/crimson/os/cyanstore/cyan_store.h +++ b/src/crimson/os/cyanstore/cyan_store.h @@ -34,11 +34,13 @@ public: seastar::future<struct stat> stat( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; base_errorator::future<bool> exists( CollectionRef ch, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<ceph::bufferlist> read( CollectionRef c, @@ -56,33 +58,39 @@ public: get_attr_errorator::future<ceph::bufferlist> get_attr( CollectionRef c, const ghobject_t& oid, - std::string_view name) const final; + std::string_view name, + uint32_t op_flags = 0) const final; get_attrs_ertr::future<attrs_t> get_attrs( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) final; + const omap_keys_t& keys, + uint32_t op_flags = 0) final; read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) final; get_attr_errorator::future<ceph::bufferlist> omap_get_header( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const final; + uint64_t 
limit, + uint32_t op_flags = 0) const final; seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; @@ -101,7 +109,8 @@ public: CollectionRef c, const ghobject_t& oid, uint64_t off, - uint64_t len) final; + uint64_t len, + uint32_t op_flags) final; unsigned get_max_attr_name_length() const final; @@ -221,6 +230,8 @@ public: seastar::future<std::vector<coll_core_t>> list_collections() final; + seastar::future<std::string> get_default_device_class() final; + private: seastar::sharded<CyanStore::Shard> shard_stores; const std::string path; diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h index 0dca695ba3a..e7d4c8546de 100644 --- a/src/crimson/os/futurized_store.h +++ b/src/crimson/os/futurized_store.h @@ -54,7 +54,8 @@ public: virtual base_errorator::future<bool> exists( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; using get_attr_errorator = crimson::errorator< crimson::ct_error::enoent, @@ -62,42 +63,49 @@ public: virtual get_attr_errorator::future<ceph::bufferlist> get_attr( CollectionRef c, const ghobject_t& oid, - std::string_view name) const = 0; + std::string_view name, + uint32_t op_flags = 0) const = 0; using get_attrs_ertr = crimson::errorator< crimson::ct_error::enoent>; using attrs_t = std::map<std::string, ceph::bufferlist, std::less<>>; virtual get_attrs_ertr::future<attrs_t> get_attrs( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; virtual seastar::future<struct stat> stat( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; using omap_values_t = attrs_t; using omap_keys_t = std::set<std::string>; virtual read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) = 0; + const omap_keys_t& keys, + uint32_t op_flags = 0) = 0; using omap_values_paged_t = std::tuple<bool, omap_values_t>; virtual read_errorator::future<omap_values_paged_t> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) = 0; ///< @return <done, values> values.empty() only if done virtual get_attr_errorator::future<bufferlist> omap_get_header( CollectionRef c, - const ghobject_t& oid) = 0; + const ghobject_t& oid, + uint32_t op_flags = 0) = 0; virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects( CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const = 0; + uint64_t limit, + uint32_t op_flags = 0) const = 0; virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0; @@ -153,7 +161,8 @@ public: CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) = 0; + uint64_t len, + uint32_t op_flags = 0) = 0; virtual unsigned get_max_attr_name_length() const = 0; }; @@ -203,6 +212,7 @@ public: using coll_core_t = std::pair<coll_t, core_id_t>; virtual seastar::future<std::vector<coll_core_t>> list_collections() = 0; + virtual seastar::future<std::string> get_default_device_class() = 0; protected: const core_id_t primary_core; }; diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index 4bdbab8c4e5..3da5e65ceec 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ 
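A recurring pattern across the futurized_store.h, alien_store.h, and cyan_store.h hunks above is appending a trailing uint32_t op_flags = 0 to each read-path method. The default keeps every existing call site compiling, and because C++ binds default arguments statically, the = 0 has to be repeated on the overrides just as the diff does, not only on the FuturizedStore interface. A minimal sketch (Store and MemStore are illustrative names, not the crimson classes):

```cpp
#include <cstdint>
#include <iostream>

struct Store {
  virtual ~Store() = default;
  virtual bool exists(int oid, uint32_t op_flags = 0) = 0;
};

struct MemStore : Store {
  // the default is repeated here because defaults resolve via the
  // static type of the call expression, not the dynamic type
  bool exists(int oid, uint32_t op_flags = 0) override {
    std::cout << "exists(" << oid << ", flags=" << op_flags << ")\n";
    return true;
  }
};

int main() {
  MemStore m;
  Store& s = m;
  s.exists(42);       // pre-existing call sites: flags default to 0
  s.exists(42, 0x1);  // new call sites can pass a hint
}
```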
b/src/crimson/os/seastore/CMakeLists.txt @@ -1,9 +1,11 @@ set(crimson_seastore_srcs cached_extent.cc + lba_mapping.cc seastore_types.cc segment_manager.cc segment_manager/ephemeral.cc segment_manager/block.cc + transaction_interruptor.cc transaction_manager.cc transaction.cc cache.cc @@ -18,7 +20,6 @@ set(crimson_seastore_srcs omap_manager.cc omap_manager/btree/btree_omap_manager.cc omap_manager/btree/omap_btree_node_impl.cc - btree/btree_range_pin.cc btree/fixed_kv_node.cc onode.cc onode_manager/staged-fltree/node.cc diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc index 5046980eae5..64e6749562e 100644 --- a/src/crimson/os/seastore/async_cleaner.cc +++ b/src/crimson/os/seastore/async_cleaner.cc @@ -131,7 +131,7 @@ void segments_info_t::add_segment_manager( auto ssize = segment_manager.get_segment_size(); auto nsegments = segment_manager.get_num_segments(); auto sm_size = segment_manager.get_available_size(); - INFO("adding segment manager {}, size={}, ssize={}, segments={}", + INFO("adding segment manager {}, size=0x{:x}, segment size=0x{:x}, segments={}", device_id_printer_t{d_id}, sm_size, ssize, nsegments); ceph_assert(ssize > 0); ceph_assert(nsegments > 0); @@ -329,9 +329,9 @@ std::ostream &operator<<(std::ostream &os, const segments_info_t &infos) << ", closed=" << infos.get_num_closed() << ", type_journal=" << infos.get_num_type_journal() << ", type_ool=" << infos.get_num_type_ool() - << ", total=" << infos.get_total_bytes() << "B" - << ", available=" << infos.get_available_bytes() << "B" - << ", unavailable=" << infos.get_unavailable_bytes() << "B" + << ", total=0x" << std::hex << infos.get_total_bytes() << "B" + << ", available=0x" << infos.get_available_bytes() << "B" + << ", unavailable=0x" << infos.get_unavailable_bytes() << "B" << std::dec << ", available_ratio=" << infos.get_available_ratio() << ", submitted_head=" << infos.get_submitted_journal_head() << ", time_bound=" << sea_time_point_printer_t{infos.get_time_bound()} @@ -609,6 +609,7 @@ JournalTrimmerImpl::trim_alloc() return extent_callback->with_transaction_intr( Transaction::src_t::TRIM_ALLOC, "trim_alloc", + CACHE_HINT_NOCACHE, [this, FNAME](auto &t) { auto target = get_alloc_tail_target(); @@ -653,6 +654,7 @@ JournalTrimmerImpl::trim_dirty() return extent_callback->with_transaction_intr( Transaction::src_t::TRIM_DIRTY, "trim_dirty", + CACHE_HINT_NOCACHE, [this, FNAME](auto &t) { auto target = get_dirty_tail_target(); @@ -765,10 +767,10 @@ int64_t SpaceTrackerDetailed::SegmentMap::allocate( for (auto i = b; i < e; ++i) { if (bitmap[i]) { if (!error) { - ERROR("found allocated in {}, {} ~ {}", segment, offset, len); + ERROR("found allocated in {}, 0x{:x}~0x{:x}", segment, offset, len); error = true; } - DEBUG("block {} allocated", i * block_size); + DEBUG("block 0x{:x}B allocated", i * block_size); } bitmap[i] = true; } @@ -792,10 +794,10 @@ int64_t SpaceTrackerDetailed::SegmentMap::release( for (auto i = b; i < e; ++i) { if (!bitmap[i]) { if (!error) { - ERROR("found unallocated in {}, {} ~ {}", segment, offset, len); + ERROR("found unallocated in {}, 0x{:x}~0x{:x}", segment, offset, len); error = true; } - DEBUG("block {} unallocated", i * block_size); + DEBUG("block 0x{:x}B unallocated", i * block_size); } bitmap[i] = false; } @@ -831,7 +833,7 @@ void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const INFO("dump start"); for (unsigned i = 0; i < bitmap.size(); ++i) { if (bitmap[i]) { - LOCAL_LOGGER.info(" {} still live", i * block_size); + 
LOCAL_LOGGER.info(" 0x{:x}B still live", i * block_size); } } } @@ -847,7 +849,7 @@ void SpaceTrackerDetailed::dump_usage(segment_id_t id) const void SpaceTrackerSimple::dump_usage(segment_id_t id) const { LOG_PREFIX(SpaceTrackerSimple::dump_usage); - INFO("id: {}, live_bytes: {}", + INFO("id: {}, live_bytes: 0x{:x}", id, live_bytes_by_segment[id].live_bytes); } @@ -1125,6 +1127,7 @@ SegmentCleaner::do_reclaim_space( return extent_callback->with_transaction_intr( src, "clean_reclaim_space", + CACHE_HINT_NOCACHE, [this, &backref_extents, &pin_list, &reclaimed](auto &t) { return seastar::do_with( @@ -1142,8 +1145,7 @@ SegmentCleaner::do_reclaim_space( pin->get_key(), pin->get_val(), pin->get_length(), - pin->get_type(), - JOURNAL_SEQ_NULL); + pin->get_type()); } for (auto &cached_backref : cached_backref_entries) { if (cached_backref.laddr == L_ADDR_NULL) { @@ -1165,7 +1167,7 @@ SegmentCleaner::do_reclaim_space( [this, &extents, &t](auto &ent) { LOG_PREFIX(SegmentCleaner::do_reclaim_space); - TRACET("getting extent of type {} at {}~{}", + TRACET("getting extent of type {} at {}~0x{:x}", t, ent.type, ent.paddr, @@ -1241,6 +1243,7 @@ SegmentCleaner::clean_space_ret SegmentCleaner::clean_space() return extent_callback->with_transaction_intr( Transaction::src_t::READ, "retrieve_from_backref_tree", + CACHE_HINT_NOCACHE, [this, &weak_read_ret](auto &t) { return backref_manager.get_mappings( t, @@ -1507,6 +1510,7 @@ bool SegmentCleaner::check_usage() SpaceTrackerIRef tracker(space_tracker->make_empty()); extent_callback->with_transaction_weak( "check_usage", + CACHE_HINT_NOCACHE, [this, &tracker](auto &t) { return backref_manager.scan_mapped_space( t, @@ -1568,7 +1572,7 @@ void SegmentCleaner::mark_space_used( background_callback->maybe_wake_background(); assert(ret > 0); - DEBUG("segment {} new len: {}~{}, live_bytes: {}", + DEBUG("segment {} new len: {}~0x{:x}, live_bytes: 0x{:x}", seg_addr.get_segment_id(), addr, len, @@ -1591,7 +1595,7 @@ void SegmentCleaner::mark_space_free( stats.used_bytes -= len; auto& seg_addr = addr.as_seg_paddr(); - DEBUG("segment {} free len: {}~{}", + DEBUG("segment {} free len: {}~0x{:x}", seg_addr.get_segment_id(), addr, len); auto old_usage = calc_utilization(seg_addr.get_segment_id()); [[maybe_unused]] auto ret = space_tracker->release( @@ -1602,7 +1606,7 @@ void SegmentCleaner::mark_space_free( adjust_segment_util(old_usage, new_usage); background_callback->maybe_wake_blocked_io(); assert(ret >= 0); - DEBUG("segment {} free len: {}~{}, live_bytes: {}", + DEBUG("segment {} free len: {}~0x{:x}, live_bytes: 0x{:x}", seg_addr.get_segment_id(), addr, len, @@ -1687,11 +1691,11 @@ void SegmentCleaner::print(std::ostream &os, bool is_detailed) const << ", reclaim_ratio=" << get_reclaim_ratio() << ", alive_ratio=" << get_alive_ratio(); if (is_detailed) { - os << ", unavailable_unreclaimable=" + os << ", unavailable_unreclaimable=0x" << std::hex << get_unavailable_unreclaimable_bytes() << "B" - << ", unavailable_reclaimble=" + << ", unavailable_reclaimble=0x" << get_unavailable_reclaimable_bytes() << "B" - << ", alive=" << stats.used_bytes << "B" + << ", alive=0x" << stats.used_bytes << "B" << std::dec << ", " << segments; } os << ")"; @@ -1722,7 +1726,7 @@ void RBMCleaner::mark_space_used( for (auto rbm : rbms) { if (addr.get_device_id() == rbm->get_device_id()) { if (rbm->get_start() <= addr) { - DEBUG("allocate addr: {} len: {}", addr, len); + DEBUG("allocate addr: {} len: 0x{:x}", addr, len); stats.used_bytes += len; rbm->mark_space_used(addr, len); } @@ -1741,7 +1745,7 
@@ void RBMCleaner::mark_space_free( for (auto rbm : rbms) { if (addr.get_device_id() == rbm->get_device_id()) { if (rbm->get_start() <= addr) { - DEBUG("free addr: {} len: {}", addr, len); + DEBUG("free addr: {} len: 0x{:x}", addr, len); ceph_assert(stats.used_bytes >= len); stats.used_bytes -= len; rbm->mark_space_free(addr, len); @@ -1813,6 +1817,7 @@ bool RBMCleaner::check_usage() RBMSpaceTracker tracker(rbms); extent_callback->with_transaction_weak( "check_usage", + CACHE_HINT_NOCACHE, [this, &tracker, &rbms](auto &t) { return backref_manager.scan_mapped_space( t, diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h index 424247c5bdc..1cef771aeb8 100644 --- a/src/crimson/os/seastore/async_cleaner.h +++ b/src/crimson/os/seastore/async_cleaner.h @@ -17,6 +17,7 @@ #include "crimson/os/seastore/randomblock_manager_group.h" #include "crimson/os/seastore/transaction.h" #include "crimson/os/seastore/segment_seq_allocator.h" +#include "crimson/os/seastore/backref_mapping.h" namespace crimson::os::seastore { @@ -299,24 +300,29 @@ public: /// Creates empty transaction /// weak transaction should be type READ virtual TransactionRef create_transaction( - Transaction::src_t, const char *name, bool is_weak=false) = 0; + Transaction::src_t, + const char *name, + cache_hint_t cache_hint = CACHE_HINT_TOUCH, + bool is_weak=false) = 0; /// Creates empty transaction with interruptible context template <typename Func> auto with_transaction_intr( Transaction::src_t src, const char* name, + cache_hint_t cache_hint, Func &&f) { return do_with_transaction_intr<Func, false>( - src, name, std::forward<Func>(f)); + src, name, cache_hint, std::forward<Func>(f)); } template <typename Func> auto with_transaction_weak( const char* name, + cache_hint_t cache_hint, Func &&f) { return do_with_transaction_intr<Func, true>( - Transaction::src_t::READ, name, std::forward<Func>(f) + Transaction::src_t::READ, name, cache_hint, std::forward<Func>(f) ).handle_error( crimson::ct_error::eagain::assert_failure{"unexpected eagain"}, crimson::ct_error::pass_further_all{} @@ -385,9 +391,10 @@ private: auto do_with_transaction_intr( Transaction::src_t src, const char* name, + cache_hint_t cache_hint, Func &&f) { return seastar::do_with( - create_transaction(src, name, IsWeak), + create_transaction(src, name, cache_hint, IsWeak), [f=std::forward<Func>(f)](auto &ref_t) mutable { return with_trans_intr( *ref_t, diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index f89698d602a..9cbf65f4033 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -28,28 +28,22 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node< ceph_assert(backref_root->is_initial_pending() == root_block->is_pending()); return {true, - trans_intr::make_interruptible( - c.cache.get_extent_viewable_by_trans(c.trans, backref_root))}; + c.cache.get_extent_viewable_by_trans(c.trans, backref_root)}; } else if (root_block->is_pending()) { auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance()); backref_root = prior.backref_root_node; if (backref_root) { return {true, - trans_intr::make_interruptible( - c.cache.get_extent_viewable_by_trans(c.trans, backref_root))}; + c.cache.get_extent_viewable_by_trans(c.trans, backref_root)}; } else { c.cache.account_absent_access(c.trans.get_src()); return {false, - trans_intr::make_interruptible( - 
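The async_cleaner changes above thread a new cache_hint_t argument from the with_transaction_* helpers into create_transaction(), so background work such as trim_alloc() and check_usage() can pass CACHE_HINT_NOCACHE and avoid promoting the extents it touches. A stripped-down sketch of that plumbing, with stub types and hint values standing in for the crimson definitions:

```cpp
#include <cstdint>
#include <utility>

using cache_hint_t = uint32_t;
constexpr cache_hint_t CACHE_HINT_TOUCH = 0;    // assumed default policy
constexpr cache_hint_t CACHE_HINT_NOCACHE = 1;  // assumed opt-out value

struct Transaction {
  const char* name;
  cache_hint_t hint;
};

Transaction create_transaction(const char* name, cache_hint_t hint) {
  return Transaction{name, hint};
}

template <typename Func>
auto with_transaction(const char* name, cache_hint_t hint, Func&& f) {
  Transaction t = create_transaction(name, hint);
  return std::forward<Func>(f)(t);  // run the body against the hinted txn
}

int main() {
  // background cleaning opts out of cache promotion, as trim_alloc() does
  return with_transaction("trim_alloc", CACHE_HINT_NOCACHE,
      [](Transaction& t) { return t.hint == CACHE_HINT_NOCACHE ? 0 : 1; });
}
```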
Cache::get_extent_ertr::make_ready_future< - CachedExtentRef>())}; + Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()}; } } else { c.cache.account_absent_access(c.trans.get_src()); return {false, - trans_intr::make_interruptible( - Cache::get_extent_ertr::make_ready_future< - CachedExtentRef>())}; + Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()}; } } diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h index 38084bb00e6..24897dd55da 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.h +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ -9,44 +9,28 @@ namespace crimson::os::seastore::backref { -constexpr size_t BACKREF_BLOCK_SIZE = 4096; - -class BtreeBackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> { - extent_types_t type; +class BtreeBackrefMapping : public BackrefMapping { public: BtreeBackrefMapping(op_context_t<paddr_t> ctx) - : BtreeNodeMapping(ctx) {} + : BackrefMapping(ctx) {} BtreeBackrefMapping( op_context_t<paddr_t> ctx, CachedExtentRef parent, uint16_t pos, backref_map_val_t &val, backref_node_meta_t &&meta) - : BtreeNodeMapping( + : BackrefMapping( + val.type, ctx, parent, pos, val.laddr, val.len, - std::forward<backref_node_meta_t>(meta)), - type(val.type) - {} - extent_types_t get_type() const final { - return type; - } - - bool is_clone() const final { - return false; - } - -protected: - std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>> _duplicate( - op_context_t<paddr_t> ctx) const final { - return std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>>( - new BtreeBackrefMapping(ctx)); - } + std::forward<backref_node_meta_t>(meta)) {} }; +constexpr size_t BACKREF_BLOCK_SIZE = 4096; + using BackrefBtree = FixedKVBtree< paddr_t, backref_map_val_t, BackrefInternalNode, BackrefLeafNode, BtreeBackrefMapping, BACKREF_BLOCK_SIZE, false>; diff --git a/src/crimson/os/seastore/backref_entry.h b/src/crimson/os/seastore/backref_entry.h new file mode 100644 index 00000000000..5f9becc9565 --- /dev/null +++ b/src/crimson/os/seastore/backref_entry.h @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <memory> +#include <iostream> + +#if FMT_VERSION >= 90000 +#include <fmt/ostream.h> +#endif + +#include <boost/intrusive/set.hpp> + +#include "crimson/os/seastore/seastore_types.h" + +namespace crimson::os::seastore { + +struct backref_entry_t { + using ref_t = std::unique_ptr<backref_entry_t>; + + backref_entry_t( + const paddr_t& paddr, + const laddr_t& laddr, + extent_len_t len, + extent_types_t type) + : paddr(paddr), + laddr(laddr), + len(len), + type(type) { + assert(len > 0); + } + paddr_t paddr = P_ADDR_NULL; + laddr_t laddr = L_ADDR_NULL; + extent_len_t len = 0; + extent_types_t type = extent_types_t::NONE; + friend bool operator< ( + const backref_entry_t &l, + const backref_entry_t &r) { + return l.paddr < r.paddr; + } + friend bool operator> ( + const backref_entry_t &l, + const backref_entry_t &r) { + return l.paddr > r.paddr; + } + friend bool operator== ( + const backref_entry_t &l, + const backref_entry_t &r) { + return l.paddr == r.paddr; + } + + using set_hook_t = + boost::intrusive::set_member_hook< + boost::intrusive::link_mode< + boost::intrusive::auto_unlink>>; + set_hook_t backref_set_hook; + using backref_set_member_options = boost::intrusive::member_hook< + backref_entry_t, + set_hook_t, + &backref_entry_t::backref_set_hook>; + using 
multiset_t = boost::intrusive::multiset< + backref_entry_t, + backref_set_member_options, + boost::intrusive::constant_time_size<false>>; + + struct cmp_t { + using is_transparent = paddr_t; + bool operator()( + const backref_entry_t &l, + const backref_entry_t &r) const { + return l.paddr < r.paddr; + } + bool operator()(const paddr_t l, const backref_entry_t &r) const { + return l < r.paddr; + } + bool operator()(const backref_entry_t &l, const paddr_t r) const { + return l.paddr < r; + } + }; + + static ref_t create_alloc( + const paddr_t& paddr, + const laddr_t& laddr, + extent_len_t len, + extent_types_t type) { + assert(is_backref_mapped_type(type)); + assert(laddr != L_ADDR_NULL); + return std::make_unique<backref_entry_t>( + paddr, laddr, len, type); + } + + static ref_t create_retire( + const paddr_t& paddr, + extent_len_t len, + extent_types_t type) { + assert(is_backref_mapped_type(type) || + is_retired_placeholder_type(type)); + return std::make_unique<backref_entry_t>( + paddr, L_ADDR_NULL, len, type); + } + + static ref_t create(const alloc_blk_t& delta) { + return std::make_unique<backref_entry_t>( + delta.paddr, delta.laddr, delta.len, delta.type); + } +}; + +inline std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) { + return out << "backref_entry_t{" + << ent.paddr << "~0x" << std::hex << ent.len << std::dec << ", " + << "laddr: " << ent.laddr << ", " + << "type: " << ent.type + << "}"; +} + +using backref_entry_ref = backref_entry_t::ref_t; +using backref_entry_mset_t = backref_entry_t::multiset_t; +using backref_entry_refs_t = std::vector<backref_entry_ref>; +using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>; +using backref_entry_query_set_t = std::set<backref_entry_t, backref_entry_t::cmp_t>; + +} // namespace crimson::os::seastore + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::backref_entry_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/backref_manager.h b/src/crimson/os/seastore/backref_manager.h index 3feedb997b4..8c746b571b2 100644 --- a/src/crimson/os/seastore/backref_manager.h +++ b/src/crimson/os/seastore/backref_manager.h @@ -6,6 +6,7 @@ #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/transaction.h" +#include "crimson/os/seastore/backref_mapping.h" namespace crimson::os::seastore { diff --git a/src/crimson/os/seastore/backref_mapping.h b/src/crimson/os/seastore/backref_mapping.h new file mode 100644 index 00000000000..d0a6a0ea6ff --- /dev/null +++ b/src/crimson/os/seastore/backref_mapping.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/btree/btree_range_pin.h" + +namespace crimson::os::seastore { + +class BackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> { + extent_types_t type; +public: + BackrefMapping(op_context_t<paddr_t> ctx) + : BtreeNodeMapping(ctx) {} + template <typename... T> + BackrefMapping(extent_types_t type, T&&... 
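backref_entry.h above wires backref_entry_t into a boost::intrusive multiset: auto_unlink member hooks let an entry drop out of the set automatically when it is destroyed, and the transparent cmp_t comparator allows lookups by a bare paddr_t. A condensed sketch of the same setup, with a toy entry keyed by an int paddr instead of the seastore types:

```cpp
#include <cassert>
#include <memory>
#include <vector>
#include <boost/intrusive/set.hpp>

namespace bi = boost::intrusive;

struct entry_t {
  explicit entry_t(int paddr) : paddr(paddr) {}
  int paddr;
  using hook_t = bi::set_member_hook<bi::link_mode<bi::auto_unlink>>;
  hook_t hook;
  struct cmp_t {
    using is_transparent = int;  // enable key-only lookups
    bool operator()(const entry_t& l, const entry_t& r) const { return l.paddr < r.paddr; }
    bool operator()(int l, const entry_t& r) const { return l < r.paddr; }
    bool operator()(const entry_t& l, int r) const { return l.paddr < r; }
  };
};

// auto_unlink hooks require constant_time_size<false>
using entry_mset_t = bi::multiset<
  entry_t,
  bi::member_hook<entry_t, entry_t::hook_t, &entry_t::hook>,
  bi::compare<entry_t::cmp_t>,
  bi::constant_time_size<false>>;

int main() {
  std::vector<std::unique_ptr<entry_t>> owners;  // the mset does not own entries
  entry_mset_t mset;
  for (int p : {3, 1, 2}) {
    owners.push_back(std::make_unique<entry_t>(p));
    mset.insert(*owners.back());
  }
  assert(mset.find(2, entry_t::cmp_t{}) != mset.end());  // lookup by key alone
  owners.pop_back();          // destroying an entry unlinks it automatically
  assert(mset.size() == 2);   // linear count without constant_time_size
}
```

auto_unlink hooks force constant_time_size<false>, which is why the diff disables constant-time size tracking on the multiset.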
t) + : BtreeNodeMapping(std::forward<T>(t)...), + type(type) {} + extent_types_t get_type() const { + return type; + } +}; + +using BackrefMappingRef = std::unique_ptr<BackrefMapping>; +using backref_pin_list_t = std::list<BackrefMappingRef>; + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc deleted file mode 100644 index f0d507a24c4..00000000000 --- a/src/crimson/os/seastore/btree/btree_range_pin.cc +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "crimson/os/seastore/btree/btree_range_pin.h" -#include "crimson/os/seastore/btree/fixed_kv_node.h" - -namespace crimson::os::seastore { - -template <typename key_t, typename val_t> -get_child_ret_t<LogicalCachedExtent> -BtreeNodeMapping<key_t, val_t>::get_logical_extent( - Transaction &t) -{ - ceph_assert(is_parent_viewable()); - assert(pos != std::numeric_limits<uint16_t>::max()); - ceph_assert(t.get_trans_id() == ctx.trans.get_trans_id()); - auto &p = (FixedKVNode<key_t>&)*parent; - auto k = this->is_indirect() - ? this->get_intermediate_base() - : get_key(); - auto v = p.template get_child<LogicalCachedExtent>(ctx, pos, k); - if (!v.has_child()) { - this->child_pos = v.get_child_pos(); - } - return v; -} - -template <typename key_t, typename val_t> -bool BtreeNodeMapping<key_t, val_t>::is_stable() const -{ - assert(!this->parent_modified()); - assert(pos != std::numeric_limits<uint16_t>::max()); - auto &p = (FixedKVNode<key_t>&)*parent; - auto k = this->is_indirect() - ? this->get_intermediate_base() - : get_key(); - return p.is_child_stable(ctx, pos, k); -} - -template <typename key_t, typename val_t> -bool BtreeNodeMapping<key_t, val_t>::is_data_stable() const -{ - assert(!this->parent_modified()); - assert(pos != std::numeric_limits<uint16_t>::max()); - auto &p = (FixedKVNode<key_t>&)*parent; - auto k = this->is_indirect() - ? 
this->get_intermediate_base() - : get_key(); - return p.is_child_data_stable(ctx, pos, k); -} - -template class BtreeNodeMapping<laddr_t, paddr_t>; -template class BtreeNodeMapping<paddr_t, laddr_t>; -} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h index 91751801e5d..bfd350a8bed 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.h +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -7,11 +7,12 @@ #include "crimson/common/log.h" -#include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction.h" namespace crimson::os::seastore { +class Cache; template <typename node_key_t> struct op_context_t { @@ -116,8 +117,6 @@ protected: extent_len_t len = 0; fixed_kv_node_meta_t<key_t> range; uint16_t pos = std::numeric_limits<uint16_t>::max(); - - virtual std::unique_ptr<BtreeNodeMapping> _duplicate(op_context_t<key_t>) const = 0; fixed_kv_node_meta_t<key_t> _get_pin_range() const { return range; } @@ -139,11 +138,7 @@ public: len(len), range(meta), pos(pos) - { - if (!parent->is_pending()) { - this->child_pos = {parent, pos}; - } - } + {} CachedExtentRef get_parent() const final { return parent; @@ -162,11 +157,6 @@ public: return len; } - extent_types_t get_type() const override { - ceph_abort("should never happen"); - return extent_types_t::ROOT; - } - val_t get_val() const final { if constexpr (std::is_same_v<val_t, paddr_t>) { return value.get_paddr(); @@ -180,16 +170,6 @@ public: return range.begin; } - PhysicalNodeMappingRef<key_t, val_t> duplicate() const final { - auto ret = _duplicate(ctx); - ret->range = range; - ret->value = value; - ret->parent = parent; - ret->len = len; - ret->pos = pos; - return ret; - } - bool has_been_invalidated() const final { return parent->has_been_invalidated(); } @@ -215,9 +195,6 @@ public: return unviewable; } - get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction&) final; - bool is_stable() const final; - bool is_data_stable() const final; bool is_parent_viewable() const final { ceph_assert(parent); if (!parent->is_valid()) { diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index cb4fff32750..04ebcc7e2ca 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -32,10 +32,6 @@ inline ChildableCachedExtent* get_reserved_ptr() { template <typename T> phy_tree_root_t& get_phy_tree_root(root_t& r); -using get_child_iertr = - ::crimson::interruptible::interruptible_errorator< - typename trans_intr::condition, - get_child_ertr>; using get_phy_tree_root_node_ret = std::pair<bool, get_child_iertr::future<CachedExtentRef>>; @@ -1501,7 +1497,7 @@ private: // checking the lba child must be atomic with creating // and linking the absent child if (v.has_child()) { - return trans_intr::make_interruptible(std::move(v.get_child_fut()) + return std::move(v.get_child_fut() ).si_then([on_found=std::move(on_found), node_iter, c, parent_entry](auto child) { LOG_PREFIX(FixedKVBtree::lookup_internal_level); @@ -1571,7 +1567,7 @@ private: // checking the lba child must be atomic with creating // and linking the absent child if (v.has_child()) { - return trans_intr::make_interruptible(std::move(v.get_child_fut()) + return std::move(v.get_child_fut() ).si_then([on_found=std::move(on_found), node_iter, c, parent_entry](auto child) { 
LOG_PREFIX(FixedKVBtree::lookup_leaf); @@ -2126,7 +2122,7 @@ private: // checking the lba child must be atomic with creating // and linking the absent child if (v.has_child()) { - return trans_intr::make_interruptible(std::move(v.get_child_fut()) + return std::move(v.get_child_fut() ).si_then([do_merge=std::move(do_merge), &pos, donor_iter, donor_is_left, c, parent_pos](auto child) { LOG_PREFIX(FixedKVBtree::merge_level); diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index 09f54a4f2d0..63e2ca38c42 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -165,6 +165,11 @@ struct FixedKVNode : ChildableCachedExtent { : ChildableCachedExtent(std::move(ptr)), children(capacity, nullptr), capacity(capacity) {} + // Must be identical with FixedKVNode(capacity, ptr) after on_fully_loaded() + explicit FixedKVNode(uint16_t capacity, extent_len_t length) + : ChildableCachedExtent(length), + children(capacity, nullptr), + capacity(capacity) {} FixedKVNode(const FixedKVNode &rhs) : ChildableCachedExtent(rhs), range(rhs.range), @@ -708,12 +713,17 @@ struct FixedKVInternalNode node_size, node_type_t>; - FixedKVInternalNode(ceph::bufferptr &&ptr) - : FixedKVNode<NODE_KEY>(CAPACITY, std::move(ptr)), - node_layout_t(this->get_bptr().c_str()) {} + explicit FixedKVInternalNode(ceph::bufferptr &&ptr) + : FixedKVNode<NODE_KEY>(CAPACITY, std::move(ptr)) { + this->set_layout_buf(this->get_bptr().c_str()); + } + // Must be identical with FixedKVInternalNode(ptr) after on_fully_loaded() + explicit FixedKVInternalNode(extent_len_t length) + : FixedKVNode<NODE_KEY>(CAPACITY, length) {} FixedKVInternalNode(const FixedKVInternalNode &rhs) - : FixedKVNode<NODE_KEY>(rhs), - node_layout_t(this->get_bptr().c_str()) {} + : FixedKVNode<NODE_KEY>(rhs) { + this->set_layout_buf(this->get_bptr().c_str()); + } bool have_children() const final { return true; @@ -985,6 +995,10 @@ struct FixedKVInternalNode pivot); } + void on_fully_loaded() final { + this->set_layout_buf(this->get_bptr().c_str()); + } + /** * Internal relative addresses on read or in memory prior to commit * are either record or block relative depending on whether this @@ -994,8 +1008,7 @@ struct FixedKVInternalNode * resolve_relative_addrs fixes up relative internal references * based on base. */ - void resolve_relative_addrs(paddr_t base) - { + void resolve_relative_addrs(paddr_t base) final { LOG_PREFIX(FixedKVInternalNode::resolve_relative_addrs); for (auto i: *this) { if (i->get_val().is_relative()) { @@ -1122,13 +1135,18 @@ struct FixedKVLeafNode node_type_t, has_children>; using base_t = FixedKVNode<NODE_KEY>; - FixedKVLeafNode(ceph::bufferptr &&ptr) - : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, std::move(ptr)), - node_layout_t(this->get_bptr().c_str()) {} + explicit FixedKVLeafNode(ceph::bufferptr &&ptr) + : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, std::move(ptr)) { + this->set_layout_buf(this->get_bptr().c_str()); + } + // Must be identical with FixedKVLeafNode(ptr) after on_fully_loaded() + explicit FixedKVLeafNode(extent_len_t length) + : FixedKVNode<NODE_KEY>(has_children ? 
CAPACITY : 0, length) {} FixedKVLeafNode(const FixedKVLeafNode &rhs) : FixedKVNode<NODE_KEY>(rhs), - node_layout_t(this->get_bptr().c_str()), - modifications(rhs.modifications) {} + modifications(rhs.modifications) { + this->set_layout_buf(this->get_bptr().c_str()); + } static constexpr bool do_has_children = has_children; // for the stable extent, modifications is always 0; @@ -1235,6 +1253,10 @@ struct FixedKVLeafNode } } + void on_fully_loaded() final { + this->set_layout_buf(this->get_bptr().c_str()); + } + void prepare_commit() final { if constexpr (has_children) { if (this->is_initial_pending()) { diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 5dcb7514ee1..86f816e1648 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -28,15 +28,6 @@ SET_SUBSYS(seastore_cache); namespace crimson::os::seastore { -std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) { - return out << "backref_entry_t{" - << ent.paddr << "~" << ent.len << ", " - << "laddr: " << ent.laddr << ", " - << "type: " << ent.type << ", " - << "seq: " << ent.seq << ", " - << "}"; -} - Cache::Cache( ExtentPlacementManager &epm) : epm(epm), @@ -44,7 +35,7 @@ Cache::Cache( "seastore_cache_lru_size")) { LOG_PREFIX(Cache::Cache); - INFO("created, lru_capacity={}B", lru.get_capacity_bytes()); + INFO("created, lru_capacity=0x{:x}B", lru.get_capacity_bytes()); register_metrics(); segment_providers_by_device_id.resize(DEVICE_ID_MAX, nullptr); } @@ -63,18 +54,18 @@ Cache::retire_extent_ret Cache::retire_extent_addr( Transaction &t, paddr_t addr, extent_len_t length) { LOG_PREFIX(Cache::retire_extent_addr); - TRACET("retire {}~{}", t, addr, length); + TRACET("retire {}~0x{:x}", t, addr, length); assert(addr.is_real() && !addr.is_block_relative()); CachedExtentRef ext; auto result = t.get_extent(addr, &ext); if (result == Transaction::get_extent_ret::PRESENT) { - DEBUGT("retire {}~{} on t -- {}", t, addr, length, *ext); + DEBUGT("retire {}~0x{:x} on t -- {}", t, addr, length, *ext); t.add_to_retired_set(CachedExtentRef(&*ext)); return retire_extent_iertr::now(); } else if (result == Transaction::get_extent_ret::RETIRED) { - ERRORT("retire {}~{} failed, already retired -- {}", t, addr, length, *ext); + ERRORT("retire {}~0x{:x} failed, already retired -- {}", t, addr, length, *ext); ceph_abort(); } @@ -85,7 +76,7 @@ Cache::retire_extent_ret Cache::retire_extent_addr( // retiring is not included by the cache hit metrics ext = query_cache(addr); if (ext) { - DEBUGT("retire {}~{} in cache -- {}", t, addr, length, *ext); + DEBUGT("retire {}~0x{:x} in cache -- {}", t, addr, length, *ext); } else { // add a new placeholder to Cache ext = CachedExtent::make_cached_extent_ref< @@ -95,7 +86,7 @@ Cache::retire_extent_ret Cache::retire_extent_addr( PLACEMENT_HINT_NULL, NULL_GENERATION, TRANS_ID_NULL); - DEBUGT("retire {}~{} as placeholder, add extent -- {}", + DEBUGT("retire {}~0x{:x} as placeholder, add extent -- {}", t, addr, length, *ext); add_extent(ext); } @@ -123,7 +114,7 @@ void Cache::retire_absent_extent_addr( PLACEMENT_HINT_NULL, NULL_GENERATION, TRANS_ID_NULL); - DEBUGT("retire {}~{} as placeholder, add extent -- {}", + DEBUGT("retire {}~0x{:x} as placeholder, add extent -- {}", t, addr, length, *ext); add_extent(ext); t.add_to_read_set(ext); @@ -172,6 +163,7 @@ void Cache::register_metrics() {extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")}, {extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")}, 
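The fixed_kv_node.h constructors above pair with the set_layout_buf() method introduced in fixed_kv_node_layout.h earlier in this diff: a node created from just a length has no bytes yet, and on_fully_loaded() attaches the buffer once the extent is read, after which the object must be indistinguishable from one constructed with the buffer up front (the "Must be identical ... after on_fully_loaded()" comments). A simplified sketch of that two-phase initialization; Layout and Node are toy stand-ins for the crimson classes:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

class Layout {
  char* buf = nullptr;
public:
  void set_layout_buf(char* b) {
    assert(buf == nullptr);  // attach exactly once
    assert(b != nullptr);
    buf = b;
  }
  char* data() const { assert(buf); return buf; }
};

class Node : public Layout {
  std::vector<char> bytes;
public:
  explicit Node(std::vector<char>&& loaded) : bytes(std::move(loaded)) {
    set_layout_buf(bytes.data());  // fully-loaded construction
  }
  explicit Node(std::size_t length) : bytes(length) {}  // lazy: no layout yet
  void on_fully_loaded() { set_layout_buf(bytes.data()); }  // attach later
};
```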
{extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")}, + {extent_types_t::ROOT_META, sm::label_instance("ext", "ROOT_META")}, {extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")}, {extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")}, {extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")}, @@ -1081,7 +1073,7 @@ CachedExtentRef Cache::alloc_new_extent_by_type( ) { LOG_PREFIX(Cache::alloc_new_extent_by_type); - SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}", + SUBDEBUGT(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}", t, type, length, hint, rewrite_gen_printer_t{gen}); ceph_assert(get_extent_category(type) == data_category_t::METADATA); switch (type) { @@ -1093,6 +1085,9 @@ CachedExtentRef Cache::alloc_new_extent_by_type( case extent_types_t::LADDR_LEAF: return alloc_new_non_data_extent<lba_manager::btree::LBALeafNode>( t, length, hint, gen); + case extent_types_t::ROOT_META: + return alloc_new_non_data_extent<RootMetaBlock>( + t, length, hint, gen); case extent_types_t::ONODE_BLOCK_STAGED: return alloc_new_non_data_extent<onode::SeastoreNodeExtent>( t, length, hint, gen); @@ -1129,7 +1124,7 @@ std::vector<CachedExtentRef> Cache::alloc_new_data_extents_by_type( ) { LOG_PREFIX(Cache::alloc_new_data_extents_by_type); - SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}", + SUBDEBUGT(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}", t, type, length, hint, rewrite_gen_printer_t{gen}); ceph_assert(get_extent_category(type) == data_category_t::DATA); std::vector<CachedExtentRef> res; @@ -1344,21 +1339,39 @@ record_t Cache::prepare_record( io_stat_t retire_stat; std::vector<alloc_delta_t> alloc_deltas; alloc_delta_t rel_delta; + backref_entry_refs_t backref_entries; rel_delta.op = alloc_delta_t::op_types_t::CLEAR; for (auto &i: t.retired_set) { auto &extent = i.extent; get_by_ext(efforts.retire_by_ext, extent->get_type()).increment(extent->get_length()); retire_stat.increment(extent->get_length()); - DEBUGT("retired and remove extent -- {}", t, *extent); + DEBUGT("retired and remove extent {}~0x{:x} -- {}", + t, extent->get_paddr(), extent->get_length(), *extent); commit_retire_extent(t, extent); - if (is_backref_mapped_extent_node(extent) || - is_retired_placeholder_type(extent->get_type())) { + + // Note: commit extents and backref allocations in the same place + if (is_backref_mapped_type(extent->get_type()) || + is_retired_placeholder_type(extent->get_type())) { + DEBUGT("backref_entry free {}~0x{:x}", + t, + extent->get_paddr(), + extent->get_length()); rel_delta.alloc_blk_ranges.emplace_back( - extent->get_paddr(), - L_ADDR_NULL, - extent->get_length(), - extent->get_type()); + alloc_blk_t::create_retire( + extent->get_paddr(), + extent->get_length(), + extent->get_type())); + backref_entries.emplace_back( + backref_entry_t::create_retire( + extent->get_paddr(), + extent->get_length(), + extent->get_type())); + } else if (is_backref_node(extent->get_type())) { + remove_backref_extent(extent->get_paddr()); + } else { + ERRORT("Got unexpected extent type: {}", t, *extent); + ceph_abort("impossible"); } } alloc_deltas.emplace_back(std::move(rel_delta)); @@ -1395,27 +1408,40 @@ record_t Cache::prepare_record( if (modify_time == NULL_TIME) { modify_time = commit_time; } + laddr_t fresh_laddr; + if (i->is_logical()) { + fresh_laddr = i->cast<LogicalCachedExtent>()->get_laddr(); + } else if (is_lba_node(i->get_type())) { + fresh_laddr =
i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin; + } else { + fresh_laddr = L_ADDR_NULL; + } record.push_back(extent_t{ i->get_type(), - i->is_logical() - ? i->cast<LogicalCachedExtent>()->get_laddr() - : (is_lba_node(i->get_type()) - ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin - : L_ADDR_NULL), + fresh_laddr, std::move(bl) }, modify_time); - if (i->is_valid() - && is_backref_mapped_extent_node(i)) { + + if (!i->is_valid()) { + continue; + } + if (is_backref_mapped_type(i->get_type())) { + laddr_t alloc_laddr; + if (i->is_logical()) { + alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr(); + } else if (is_lba_node(i->get_type())) { + alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin; + } else { + assert(i->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL); + alloc_laddr = L_ADDR_MIN; + } alloc_delta.alloc_blk_ranges.emplace_back( - i->get_paddr(), - i->is_logical() - ? i->cast<LogicalCachedExtent>()->get_laddr() - : (is_lba_node(i->get_type()) - ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin - : L_ADDR_NULL), - i->get_length(), - i->get_type()); + alloc_blk_t::create_alloc( + i->get_paddr(), + alloc_laddr, + i->get_length(), + i->get_type())); } } @@ -1426,14 +1452,20 @@ record_t Cache::prepare_record( get_by_ext(efforts.fresh_ool_by_ext, i->get_type()).increment(i->get_length()); i->prepare_commit(); - if (is_backref_mapped_extent_node(i)) { + if (is_backref_mapped_type(i->get_type())) { + laddr_t alloc_laddr; + if (i->is_logical()) { + alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr(); + } else { + assert(is_lba_node(i->get_type())); + alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin; + } alloc_delta.alloc_blk_ranges.emplace_back( - i->get_paddr(), - i->is_logical() - ? 
i->cast<LogicalCachedExtent>()->get_laddr() - : i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin, - i->get_length(), - i->get_type()); + alloc_blk_t::create_alloc( + i->get_paddr(), + alloc_laddr, + i->get_length(), + i->get_type())); } } @@ -1451,19 +1483,57 @@ record_t Cache::prepare_record( i->state = CachedExtent::extent_state_t::CLEAN; assert(i->is_logical()); i->clear_modified_region(); - touch_extent(*i, &trans_src); + touch_extent(*i, &trans_src, t.get_cache_hint()); DEBUGT("inplace rewrite ool block is committed -- {}", t, *i); } + auto existing_stats = t.get_existing_block_stats(); + DEBUGT("total existing blocks num: {}, exist clean num: {}, " + "exist mutation pending num: {}", + t, + existing_stats.valid_num, + existing_stats.clean_num, + existing_stats.mutated_num); for (auto &i: t.existing_block_list) { - if (i->is_valid()) { - alloc_delta.alloc_blk_ranges.emplace_back( - i->get_paddr(), + assert(is_logical_type(i->get_type())); + if (!i->is_valid()) { + continue; + } + + if (i->is_exist_clean()) { + i->state = CachedExtent::extent_state_t::CLEAN; + } else { + assert(i->is_exist_mutation_pending()); + // i->state must become DIRTY in complete_commit() + } + + // exist mutation pending extents must be in t.mutated_block_list + add_extent(i); + const auto t_src = t.get_src(); + if (i->is_dirty()) { + add_to_dirty(i, &t_src); + } else { + touch_extent(*i, &t_src, t.get_cache_hint()); + } + + alloc_delta.alloc_blk_ranges.emplace_back( + alloc_blk_t::create_alloc( + i->get_paddr(), i->cast<LogicalCachedExtent>()->get_laddr(), i->get_length(), - i->get_type()); - } + i->get_type())); + + // Note: commit extents and backref allocations in the same place + // Note: remapping is split into 2 steps, retire and alloc; they must be + // committed atomically together + backref_entries.emplace_back( + backref_entry_t::create_alloc( + i->get_paddr(), + i->cast<LogicalCachedExtent>()->get_laddr(), + i->get_length(), + i->get_type())); } + alloc_deltas.emplace_back(std::move(alloc_delta)); for (auto b : alloc_deltas) { @@ -1517,6 +1587,9 @@ record_t Cache::prepare_record( record.push_back(std::move(delta)); } + apply_backref_mset(backref_entries); + t.set_backref_entries(std::move(backref_entries)); + ceph_assert(t.get_fresh_block_stats().num == t.inline_block_list.size() + t.ool_block_list.size() + @@ -1616,26 +1689,35 @@ record_t Cache::prepare_record( return record; } -void Cache::backref_batch_update( - std::vector<backref_entry_ref> &&list, - const journal_seq_t &seq) +void Cache::apply_backref_byseq( + backref_entry_refs_t&& backref_entries, + const journal_seq_t& seq) { - LOG_PREFIX(Cache::backref_batch_update); - DEBUG("inserting {} entries at {}", list.size(), seq); - ceph_assert(seq != JOURNAL_SEQ_NULL); - - for (auto &ent : list) { - backref_entry_mset.insert(*ent); + LOG_PREFIX(Cache::apply_backref_byseq); + DEBUG("backref_entry apply {} entries at {}", + backref_entries.size(), seq); + assert(seq != JOURNAL_SEQ_NULL); + if (backref_entries.empty()) { + return; } - - auto iter = backref_entryrefs_by_seq.find(seq); - if (iter == backref_entryrefs_by_seq.end()) { - backref_entryrefs_by_seq.emplace(seq, std::move(list)); + if (backref_entryrefs_by_seq.empty()) { + backref_entryrefs_by_seq.insert( + backref_entryrefs_by_seq.end(), + {seq, std::move(backref_entries)}); + return; + } + auto last = backref_entryrefs_by_seq.rbegin(); + assert(last->first <= seq); + if (last->first == seq) { + last->second.insert( + last->second.end(),
std::make_move_iterator(backref_entries.begin()), + std::make_move_iterator(backref_entries.end())); } else { - iter->second.insert( - iter->second.end(), - std::make_move_iterator(list.begin()), - std::make_move_iterator(list.end())); + assert(last->first < seq); + backref_entryrefs_by_seq.insert( + backref_entryrefs_by_seq.end(), + {seq, std::move(backref_entries)}); } } @@ -1648,7 +1730,7 @@ void Cache::complete_commit( SUBTRACET(seastore_t, "final_block_start={}, start_seq={}", t, final_block_start, start_seq); - std::vector<backref_entry_ref> backref_list; + backref_entry_refs_t backref_entries; t.for_each_finalized_fresh_block([&](const CachedExtentRef &i) { if (!i->is_valid()) { return; @@ -1677,24 +1759,30 @@ void Cache::complete_commit( add_extent(i); assert(!i->is_dirty()); const auto t_src = t.get_src(); - touch_extent(*i, &t_src); + touch_extent(*i, &t_src, t.get_cache_hint()); epm.commit_space_used(i->get_paddr(), i->get_length()); - if (is_backref_mapped_extent_node(i)) { - DEBUGT("backref_list new {} len {}", + + // Note: commit extents and backref allocations in the same place + if (is_backref_mapped_type(i->get_type())) { + DEBUGT("backref_entry alloc {}~0x{:x}", t, i->get_paddr(), i->get_length()); - backref_list.emplace_back( - std::make_unique<backref_entry_t>( + laddr_t alloc_laddr; + if (i->is_logical()) { + alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr(); + } else if (is_lba_node(i->get_type())) { + alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin; + } else { + assert(i->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL); + alloc_laddr = L_ADDR_MIN; + } + backref_entries.emplace_back( + backref_entry_t::create_alloc( i->get_paddr(), - i->is_logical() - ? i->cast<LogicalCachedExtent>()->get_laddr() - : (is_lba_node(i->get_type()) - ? 
i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin - : L_ADDR_NULL), + alloc_laddr, i->get_length(), - i->get_type(), - start_seq)); + i->get_type())); } else if (is_backref_node(i->get_type())) { add_backref_extent( i->get_paddr(), @@ -1731,9 +1819,10 @@ void Cache::complete_commit( epm.mark_space_free(extent->get_paddr(), extent->get_length()); } for (auto &i: t.existing_block_list) { - if (i->is_valid()) { - epm.mark_space_used(i->get_paddr(), i->get_length()); + if (!i->is_valid()) { + continue; } + epm.mark_space_used(i->get_paddr(), i->get_length()); } for (auto &i: t.mutated_block_list) { @@ -1747,64 +1836,10 @@ void Cache::complete_commit( for (auto &i: t.retired_set) { auto &extent = i.extent; extent->dirty_from_or_retired_at = start_seq; - if (is_backref_mapped_extent_node(extent) || - is_retired_placeholder_type(extent->get_type())) { - DEBUGT("backref_list free {} len {}", - t, - extent->get_paddr(), - extent->get_length()); - backref_list.emplace_back( - std::make_unique<backref_entry_t>( - extent->get_paddr(), - L_ADDR_NULL, - extent->get_length(), - extent->get_type(), - start_seq)); - } else if (is_backref_node(extent->get_type())) { - remove_backref_extent(extent->get_paddr()); - } else { - ERRORT("{}", t, *extent); - ceph_abort("not possible"); - } } - auto existing_stats = t.get_existing_block_stats(); - DEBUGT("total existing blocks num: {}, exist clean num: {}, " - "exist mutation pending num: {}", - t, - existing_stats.valid_num, - existing_stats.clean_num, - existing_stats.mutated_num); - for (auto &i: t.existing_block_list) { - if (i->is_valid()) { - if (i->is_exist_clean()) { - i->state = CachedExtent::extent_state_t::CLEAN; - } else { - assert(i->state == CachedExtent::extent_state_t::DIRTY); - } - DEBUGT("backref_list new existing {} len {}", - t, - i->get_paddr(), - i->get_length()); - backref_list.emplace_back( - std::make_unique<backref_entry_t>( - i->get_paddr(), - i->cast<LogicalCachedExtent>()->get_laddr(), - i->get_length(), - i->get_type(), - start_seq)); - add_extent(i); - const auto t_src = t.get_src(); - if (i->is_dirty()) { - add_to_dirty(i, &t_src); - } else { - touch_extent(*i, &t_src); - } - } - } - if (!backref_list.empty()) { - backref_batch_update(std::move(backref_list), start_seq); - } + apply_backref_byseq(t.move_backref_entries(), start_seq); + commit_backref_entries(std::move(backref_entries), start_seq); for (auto &i: t.pre_alloc_list) { if (!i->is_valid()) { @@ -1822,7 +1857,7 @@ void Cache::init() remove_extent(root, nullptr); root = nullptr; } - root = new RootBlock(); + root = CachedExtent::make_cached_extent_ref<RootBlock>(); root->init(CachedExtent::extent_state_t::CLEAN, P_ADDR_ROOT, PLACEMENT_HINT_NULL, @@ -1927,25 +1962,18 @@ Cache::replay_delta( alloc_delta_t alloc_delta; decode(alloc_delta, delta.bl); - std::vector<backref_entry_ref> backref_list; + backref_entry_refs_t backref_entries; for (auto &alloc_blk : alloc_delta.alloc_blk_ranges) { if (alloc_blk.paddr.is_relative()) { assert(alloc_blk.paddr.is_record_relative()); alloc_blk.paddr = record_base.add_relative(alloc_blk.paddr); } - DEBUG("replay alloc_blk {}~{} {}, journal_seq: {}", + DEBUG("replay alloc_blk {}~0x{:x} {}, journal_seq: {}", alloc_blk.paddr, alloc_blk.len, alloc_blk.laddr, journal_seq); - backref_list.emplace_back( - std::make_unique<backref_entry_t>( - alloc_blk.paddr, - alloc_blk.laddr, - alloc_blk.len, - alloc_blk.type, - journal_seq)); - } - if (!backref_list.empty()) { - backref_batch_update(std::move(backref_list), journal_seq); + 
backref_entries.emplace_back( + backref_entry_t::create(alloc_blk)); } + commit_backref_entries(std::move(backref_entries), journal_seq); return replay_delta_ertr::make_ready_future<std::pair<bool, CachedExtentRef>>( std::make_pair(true, nullptr)); } @@ -1998,8 +2026,9 @@ Cache::replay_delta( [](CachedExtent &) {}, [this](CachedExtent &ext) { // replay is not included by the cache hit metrics - touch_extent(ext, nullptr); - }) : + touch_extent(ext, nullptr, CACHE_HINT_TOUCH); + }, + nullptr) : _get_extent_if_cached( delta.paddr) ).handle_error( @@ -2162,7 +2191,8 @@ Cache::do_get_caching_extent_by_type( laddr_t laddr, extent_len_t length, extent_init_func_t &&extent_init_func, - extent_init_func_t &&on_cache) + extent_init_func_t &&on_cache, + const Transaction::src_t* p_src) { return [=, this, extent_init_func=std::move(extent_init_func)]() mutable { switch (type) { @@ -2171,55 +2201,61 @@ Cache::do_get_caching_extent_by_type( return get_extent_ertr::make_ready_future<CachedExtentRef>(); case extent_types_t::BACKREF_INTERNAL: return do_get_caching_extent<backref::BackrefInternalNode>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::BACKREF_LEAF: return do_get_caching_extent<backref::BackrefLeafNode>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::LADDR_INTERNAL: return do_get_caching_extent<lba_manager::btree::LBAInternalNode>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::LADDR_LEAF: return do_get_caching_extent<lba_manager::btree::LBALeafNode>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); + case extent_types_t::ROOT_META: + return do_get_caching_extent<RootMetaBlock>( + offset, length, std::move(extent_init_func), std::move(on_cache), p_src + ).safe_then([](auto extent) { + return CachedExtentRef(extent.detach(), false /* add_ref */); + }); case extent_types_t::OMAP_INNER: return do_get_caching_extent<omap_manager::OMapInnerNode>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::OMAP_LEAF: return do_get_caching_extent<omap_manager::OMapLeafNode>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::COLL_BLOCK: return do_get_caching_extent<collection_manager::CollectionNode>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return 
CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::ONODE_BLOCK_STAGED: return do_get_caching_extent<onode::SeastoreNodeExtent>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::OBJECT_DATA_BLOCK: return do_get_caching_extent<ObjectDataBlock>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); @@ -2228,13 +2264,13 @@ Cache::do_get_caching_extent_by_type( return get_extent_ertr::make_ready_future<CachedExtentRef>(); case extent_types_t::TEST_BLOCK: return do_get_caching_extent<TestBlock>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); case extent_types_t::TEST_BLOCK_PHYSICAL: return do_get_caching_extent<TestBlockPhysical>( - offset, length, std::move(extent_init_func), std::move(on_cache) + offset, length, std::move(extent_init_func), std::move(on_cache), p_src ).safe_then([](auto extent) { return CachedExtentRef(extent.detach(), false /* add_ref */); }); diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index dba3610e95f..a239b861726 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -3,13 +3,13 @@ #pragma once -#include <iostream> - #include "seastar/core/shared_future.hh" #include "include/buffer.h" #include "crimson/common/errorator.h" +#include "crimson/common/errorator-loop.h" +#include "crimson/os/seastore/backref_entry.h" #include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/extent_placement_manager.h" #include "crimson/os/seastore/logging.h" @@ -37,86 +37,6 @@ class FixedKVBtree; class BackrefManager; class SegmentProvider; -struct backref_entry_t { - backref_entry_t( - const paddr_t paddr, - const laddr_t laddr, - const extent_len_t len, - const extent_types_t type, - const journal_seq_t seq) - : paddr(paddr), - laddr(laddr), - len(len), - type(type), - seq(seq) - {} - backref_entry_t(alloc_blk_t alloc_blk) - : paddr(alloc_blk.paddr), - laddr(alloc_blk.laddr), - len(alloc_blk.len), - type(alloc_blk.type) - {} - paddr_t paddr = P_ADDR_NULL; - laddr_t laddr = L_ADDR_NULL; - extent_len_t len = 0; - extent_types_t type = - extent_types_t::ROOT; - journal_seq_t seq; - friend bool operator< ( - const backref_entry_t &l, - const backref_entry_t &r) { - return l.paddr < r.paddr; - } - friend bool operator> ( - const backref_entry_t &l, - const backref_entry_t &r) { - return l.paddr > r.paddr; - } - friend bool operator== ( - const backref_entry_t &l, - const backref_entry_t &r) { - return l.paddr == r.paddr; - } - - using set_hook_t = - boost::intrusive::set_member_hook< - boost::intrusive::link_mode< - boost::intrusive::auto_unlink>>; - set_hook_t backref_set_hook; - using backref_set_member_options = boost::intrusive::member_hook< - backref_entry_t, - set_hook_t, - &backref_entry_t::backref_set_hook>; - using multiset_t = boost::intrusive::multiset< - backref_entry_t, - backref_set_member_options, - boost::intrusive::constant_time_size<false>>; - - struct cmp_t { - using is_transparent = paddr_t; - 
bool operator()( - const backref_entry_t &l, - const backref_entry_t &r) const { - return l.paddr < r.paddr; - } - bool operator()(const paddr_t l, const backref_entry_t &r) const { - return l < r.paddr; - } - bool operator()(const backref_entry_t &l, const paddr_t r) const { - return l.paddr < r; - } - }; -}; - -std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent); - -using backref_entry_ref = std::unique_ptr<backref_entry_t>; -using backref_entry_mset_t = backref_entry_t::multiset_t; -using backref_entry_refs_t = std::vector<backref_entry_ref>; -using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>; -using backref_entry_query_set_t = std::set< - backref_entry_t, backref_entry_t::cmp_t>; - /** * Cache * @@ -204,6 +124,7 @@ public: TransactionRef create_transaction( Transaction::src_t src, const char* name, + cache_hint_t cache_hint, bool is_weak) { LOG_PREFIX(Cache::create_transaction); @@ -217,7 +138,8 @@ public: [this](Transaction& t) { return on_transaction_destruct(t); }, - ++next_id + ++next_id, + cache_hint ); SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}", *ret, name, src, is_weak); @@ -323,8 +245,9 @@ public: CachedExtentRef>(ret); }); } else { - SUBDEBUGT(seastore_cache, "{} {} is present on t -- {}" - " without being fully loaded", t, type, offset, *ret); + SUBDEBUGT(seastore_cache, + "{} {} is present on t -- {} without fully loaded", + t, type, offset, *ret); return get_extent_if_cached_iertr::make_ready_future< CachedExtentRef>(); } @@ -354,8 +277,8 @@ public: if (!ret->is_fully_loaded()) { // ignore non-full extent - SUBDEBUGT(seastore_cache, "{} {} is present without " - "being fully loaded", t, type, offset); + SUBDEBUGT(seastore_cache, + "{} {} is present without fully loaded", t, type, offset); return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>(); } @@ -363,7 +286,7 @@ public: SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}", t, type, offset, *ret); t.add_to_read_set(ret); - touch_extent(*ret, &t_src); + touch_extent(*ret, &t_src, t.get_cache_hint()); return ret->wait_io().then([ret] { return get_extent_if_cached_iertr::make_ready_future< CachedExtentRef>(ret); @@ -394,39 +317,37 @@ public: extent_len_t length) { CachedExtentRef ret; LOG_PREFIX(Cache::get_caching_extent); + const auto t_src = t.get_src(); auto result = t.get_extent(offset, &ret); if (result == Transaction::get_extent_ret::RETIRED) { - SUBERRORT(seastore_cache, "{} {}~{} is retired on t -- {}", + SUBERRORT(seastore_cache, "{} {}~0x{:x} is retired on t -- {}", t, T::TYPE, offset, length, *ret); ceph_abort("impossible"); } else if (result == Transaction::get_extent_ret::PRESENT) { + assert(ret->get_length() == length); if (ret->is_fully_loaded()) { - SUBTRACET(seastore_cache, "{} {}~{} is present on t -- {}", + SUBTRACET(seastore_cache, "{} {}~0x{:x} is present on t -- {}", t, T::TYPE, offset, length, *ret); return ret->wait_io().then([ret] { return seastar::make_ready_future<TCachedExtentRef<T>>( ret->cast<T>()); }); } else { - assert(!ret->is_mutable()); - SUBDEBUGT(seastore_cache, "{} {}~{} is present on t without been \ - fully loaded, reading ... {}", t, T::TYPE, offset, length, *ret); - auto bp = alloc_cache_buf(ret->get_length()); - ret->set_bptr(std::move(bp)); - return read_extent<T>( - ret->cast<T>()); + SUBDEBUGT(seastore_cache, + "{} {}~0x{:x} is present on t without fully loaded, reading ... 
-- {}", + t, T::TYPE, offset, length, *ret); + return do_read_extent_maybe_partial<T>(ret->cast<T>(), 0, length, &t_src); } } else { - SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...", + SUBTRACET(seastore_cache, "{} {}~0x{:x} is absent on t, query cache ...", t, T::TYPE, offset, length); - auto f = [&t, this](CachedExtent &ext) { + auto f = [&t, this, t_src](CachedExtent &ext) { t.add_to_read_set(CachedExtentRef(&ext)); - const auto t_src = t.get_src(); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent<T>( - offset, length, [](T &){}, std::move(f)) + offset, length, [](T &){}, std::move(f), &t_src) ); } } @@ -435,12 +356,15 @@ public: * get_absent_extent * * The extent in query is supposed to be absent in Cache. + * partially load buffer from partial_off~partial_len if not present. */ template <typename T, typename Func> get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent( Transaction &t, paddr_t offset, extent_len_t length, + extent_len_t partial_off, + extent_len_t partial_len, Func &&extent_init_func) { CachedExtentRef ret; LOG_PREFIX(Cache::get_absent_extent); @@ -453,13 +377,13 @@ public: } #endif - SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...", + SUBTRACET(seastore_cache, "{} {}~0x{:x} is absent on t, query cache ...", t, T::TYPE, offset, length); - auto f = [&t, this](CachedExtent &ext) { + const auto t_src = t.get_src(); + auto f = [&t, this, t_src](CachedExtent &ext) { // FIXME: assert(ext.is_stable_clean()); assert(ext.is_stable()); assert(T::TYPE == ext.get_type()); - const auto t_src = t.get_src(); extent_access_stats_t& access_stats = get_by_ext( get_by_src(stats.access_by_src_ext, t_src), T::TYPE); @@ -467,11 +391,12 @@ public: ++stats.access.s.load_absent; t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent<T>( - offset, length, std::forward<Func>(extent_init_func), std::move(f)) + offset, length, partial_off, partial_len, + std::forward<Func>(extent_init_func), std::move(f), &t_src) ); } @@ -495,6 +420,16 @@ public: return get_absent_extent<T>(t, offset, length, [](T &){}); } + template <typename T, typename Func> + get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent( + Transaction &t, + paddr_t offset, + extent_len_t length, + Func &&extent_init_func) { + return get_absent_extent<T>(t, offset, length, 0, length, + std::forward<Func>(extent_init_func)); + } + bool is_viewable_extent_stable( Transaction &t, CachedExtentRef extent) @@ -513,8 +448,7 @@ public: return view->is_data_stable(); } - using get_extent_ertr = base_ertr; - get_extent_ertr::future<CachedExtentRef> + get_extent_iertr::future<CachedExtentRef> get_extent_viewable_by_trans( Transaction &t, CachedExtentRef extent) @@ -539,7 +473,7 @@ public: if (p_extent->is_mutable()) { assert(p_extent->is_fully_loaded()); assert(!p_extent->is_pending_io()); - return get_extent_ertr::make_ready_future<CachedExtentRef>( + return get_extent_iertr::make_ready_future<CachedExtentRef>( CachedExtentRef(p_extent)); } else { assert(p_extent->is_exist_clean()); @@ -555,7 +489,7 @@ public: ++access_stats.cache_lru; ++stats.access.s.cache_lru; } - touch_extent(*p_extent, &t_src); + touch_extent(*p_extent, &t_src, t.get_cache_hint()); } else { if (p_extent->is_dirty()) { ++access_stats.trans_dirty; @@ -574,7 +508,7 @@ public: if 
(extent->is_mutable()) { assert(extent->is_fully_loaded()); assert(!extent->is_pending_io()); - return get_extent_ertr::make_ready_future<CachedExtentRef>(extent); + return get_extent_iertr::make_ready_future<CachedExtentRef>(extent); } else { assert(extent->is_exist_clean()); p_extent = extent.get(); @@ -583,40 +517,66 @@ public: // user should not see RETIRED_PLACEHOLDER extents ceph_assert(!is_retired_placeholder_type(p_extent->get_type())); - if (!p_extent->is_fully_loaded()) { - assert(!p_extent->is_mutable()); - ++access_stats.load_present; - ++stats.access.s.load_present; - LOG_PREFIX(Cache::get_extent_viewable_by_trans); - SUBDEBUG(seastore_cache, - "{} {}~{} is present without been fully loaded, reading ... -- {}", - p_extent->get_type(), p_extent->get_paddr(),p_extent->get_length(), - *p_extent); - auto bp = alloc_cache_buf(p_extent->get_length()); - p_extent->set_bptr(std::move(bp)); - return read_extent<CachedExtent>(CachedExtentRef(p_extent)); - } - return p_extent->wait_io( - ).then([p_extent] { - return get_extent_ertr::make_ready_future<CachedExtentRef>( + // for logical extents, handle partial load in TM::read_pin(), + // also see read_extent_maybe_partial() and get_absent_extent() + assert(is_logical_type(p_extent->get_type()) || + p_extent->is_fully_loaded()); + + return trans_intr::make_interruptible( + p_extent->wait_io() + ).then_interruptible([p_extent] { + return get_extent_iertr::make_ready_future<CachedExtentRef>( CachedExtentRef(p_extent)); }); } template <typename T> - using read_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>; - - template <typename T> - read_extent_ret<T> get_extent_viewable_by_trans( + get_extent_iertr::future<TCachedExtentRef<T>> + get_extent_viewable_by_trans( Transaction &t, TCachedExtentRef<T> extent) { return get_extent_viewable_by_trans(t, CachedExtentRef(extent.get()) - ).safe_then([](auto p_extent) { + ).si_then([](auto p_extent) { return p_extent->template cast<T>(); }); } + // wait extent io or do partial reads + template <typename T> + get_extent_iertr::future<TCachedExtentRef<T>> + read_extent_maybe_partial( + Transaction &t, + TCachedExtentRef<T> extent, + extent_len_t partial_off, + extent_len_t partial_len) { + assert(is_logical_type(extent->get_type())); + if (!extent->is_range_loaded(partial_off, partial_len)) { + LOG_PREFIX(Cache::read_extent_maybe_partial); + SUBDEBUGT(seastore_cache, + "{} {}~0x{:x} is present on t without range 0x{:x}~0x{:x}, reading ... 
-- {}", + t, extent->get_type(), extent->get_paddr(), extent->get_length(), + partial_off, partial_len, *extent); + const auto t_src = t.get_src(); + extent_access_stats_t& access_stats = get_by_ext( + get_by_src(stats.access_by_src_ext, t_src), + extent->get_type()); + ++access_stats.load_present; + ++stats.access.s.load_present; + return trans_intr::make_interruptible( + do_read_extent_maybe_partial( + std::move(extent), partial_off, partial_len, &t_src)); + } else { + // TODO(implement fine-grained-wait): + // the range might be already loaded, but we don't know + return trans_intr::make_interruptible( + extent->wait_io() + ).then_interruptible([extent] { + return get_extent_iertr::make_ready_future<TCachedExtentRef<T>>(extent); + }); + } + } + extent_len_t get_block_size() const { return epm.get_block_size(); } @@ -628,54 +588,122 @@ public: } private: + using get_extent_ertr = base_ertr; + template <typename T> + using read_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>; + /// Implements exclusive call to read_extent() for the extent + template <typename T> + read_extent_ret<T> do_read_extent_maybe_partial( + TCachedExtentRef<T>&& extent, + extent_len_t partial_off, + extent_len_t partial_len, + const Transaction::src_t* p_src) + { + LOG_PREFIX(Cache::do_read_extent_maybe_partial); + // They must be atomic: + // 1. checking missing range and wait io + // 2. checking missing range and read + // because the extents in Caches can be accessed concurrently + // + // TODO(implement fine-grained-wait) + assert(!extent->is_range_loaded(partial_off, partial_len)); + assert(!extent->is_mutable()); + if (extent->is_pending_io()) { + std::optional<Transaction::src_t> src; + if (p_src) { + src = *p_src; + } + auto* p_extent = extent.get(); + return p_extent->wait_io( + ).then([extent=std::move(extent), partial_off, partial_len, this, FNAME, src]() mutable + -> read_extent_ret<T> { + if (extent->is_range_loaded(partial_off, partial_len)) { + SUBDEBUG(seastore_cache, + "{} {}~0x{:x} got range 0x{:x}~0x{:x} ... -- {}", + extent->get_type(), extent->get_paddr(), extent->get_length(), + partial_off, partial_len, *extent); + // we don't know whether the target range is loading or not + if (extent->is_pending_io()) { + auto* p_extent = extent.get(); + return p_extent->wait_io( + ).then([extent=std::move(extent)]() mutable { + return seastar::make_ready_future<TCachedExtentRef<T>>(std::move(extent)); + }); + } else { + return seastar::make_ready_future<TCachedExtentRef<T>>(std::move(extent)); + } + } else { // range not loaded + SUBDEBUG(seastore_cache, + "{} {}~0x{:x} without range 0x{:x}~0x{:x} ... -- {}", + extent->get_type(), extent->get_paddr(), extent->get_length(), + partial_off, partial_len, *extent); + Transaction::src_t* p_src = (src.has_value() ? &src.value() : nullptr); + return do_read_extent_maybe_partial( + std::move(extent), partial_off, partial_len, p_src); + } + }); + } else { + SUBDEBUG(seastore_cache, + "{} {}~0x{:x} is not pending without range 0x{:x}~0x{:x}, reading ... 
-- {}", + extent->get_type(), extent->get_paddr(), extent->get_length(), + partial_off, partial_len, *extent); + return read_extent<T>( + std::move(extent), partial_off, partial_len, p_src); + } + } + /** * do_get_caching_extent * * returns ref to extent at offset~length of type T either from * - extent_set if already in cache * - disk + * only load partial_off~partial_len */ using src_ext_t = std::pair<Transaction::src_t, extent_types_t>; template <typename T, typename Func, typename OnCache> read_extent_ret<T> do_get_caching_extent( paddr_t offset, ///< [in] starting addr extent_len_t length, ///< [in] length + extent_len_t partial_off, ///< [in] offset of piece in extent + extent_len_t partial_len, ///< [in] length of piece in extent Func &&extent_init_func, ///< [in] init func for extent - OnCache &&on_cache + OnCache &&on_cache, + const Transaction::src_t* p_src ) { LOG_PREFIX(Cache::do_get_caching_extent); auto cached = query_cache(offset); if (!cached) { - auto ret = CachedExtent::make_cached_extent_ref<T>( - alloc_cache_buf(length)); + // partial read + TCachedExtentRef<T> ret = CachedExtent::make_cached_extent_ref<T>(length); ret->init(CachedExtent::extent_state_t::CLEAN_PENDING, offset, PLACEMENT_HINT_NULL, NULL_GENERATION, TRANS_ID_NULL); SUBDEBUG(seastore_cache, - "{} {}~{} is absent, add extent and reading ... -- {}", - T::TYPE, offset, length, *ret); + "{} {}~0x{:x} is absent, add extent and reading range 0x{:x}~0x{:x} ... -- {}", + T::TYPE, offset, length, partial_off, partial_len, *ret); add_extent(ret); // touch_extent() should be included in on_cache on_cache(*ret); extent_init_func(*ret); return read_extent<T>( - std::move(ret)); + std::move(ret), partial_off, partial_len, p_src); } // extent PRESENT in cache if (is_retired_placeholder_type(cached->get_type())) { - auto ret = CachedExtent::make_cached_extent_ref<T>( - alloc_cache_buf(length)); + // partial read + TCachedExtentRef<T> ret = CachedExtent::make_cached_extent_ref<T>(length); ret->init(CachedExtent::extent_state_t::CLEAN_PENDING, offset, PLACEMENT_HINT_NULL, NULL_GENERATION, TRANS_ID_NULL); SUBDEBUG(seastore_cache, - "{} {}~{} is absent(placeholder), reading ... -- {}", - T::TYPE, offset, length, *ret); + "{} {}~0x{:x} is absent(placeholder), add extent and reading range 0x{:x}~0x{:x} ... -- {}", + T::TYPE, offset, length, partial_off, partial_len, *ret); extents_index.replace(*ret, *cached); on_cache(*ret); @@ -688,34 +716,41 @@ private: cached->state = CachedExtent::extent_state_t::INVALID; extent_init_func(*ret); return read_extent<T>( - std::move(ret)); - } else if (!cached->is_fully_loaded()) { - auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get())); - on_cache(*ret); - SUBDEBUG(seastore_cache, - "{} {}~{} is present without been fully loaded, reading ... -- {}", - T::TYPE, offset, length, *ret); - auto bp = alloc_cache_buf(length); - ret->set_bptr(std::move(bp)); - return read_extent<T>( - std::move(ret)); - } else { + std::move(ret), partial_off, partial_len, p_src); + } + + auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get())); + on_cache(*ret); + if (ret->is_range_loaded(partial_off, partial_len)) { SUBTRACE(seastore_cache, - "{} {}~{} is present in cache -- {}", - T::TYPE, offset, length, *cached); - auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get())); - on_cache(*ret); - return ret->wait_io( - ).then([ret=std::move(ret)]() mutable - -> read_extent_ret<T> { + "{} {}~0x{:x} is present with range 0x{:x}~0x{:x} ... 
-- {}", + T::TYPE, offset, length, partial_off, partial_len, *ret); + return ret->wait_io().then([ret] { // ret may be invalid, caller must check - return read_extent_ret<T>( - get_extent_ertr::ready_future_marker{}, - std::move(ret)); + return seastar::make_ready_future<TCachedExtentRef<T>>(ret); }); + } else { + SUBDEBUG(seastore_cache, + "{} {}~0x{:x} is present without range 0x{:x}~0x{:x}, reading ... -- {}", + T::TYPE, offset, length, partial_off, partial_len, *ret); + return do_read_extent_maybe_partial( + std::move(ret), partial_off, partial_len, p_src); } } + template <typename T, typename Func, typename OnCache> + read_extent_ret<T> do_get_caching_extent( + paddr_t offset, ///< [in] starting addr + extent_len_t length, ///< [in] length + Func &&extent_init_func, ///< [in] init func for extent + OnCache &&on_cache, + const Transaction::src_t* p_src + ) { + return do_get_caching_extent<T>(offset, length, 0, length, + std::forward<Func>(extent_init_func), + std::forward<OnCache>(on_cache), + p_src); + } // This is a workaround std::move_only_function not being available, // not really worth generalizing at this time. @@ -751,8 +786,8 @@ private: laddr_t laddr, extent_len_t length, extent_init_func_t &&extent_init_func, - extent_init_func_t &&on_cache - ); + extent_init_func_t &&on_cache, + const Transaction::src_t* p_src); /** * get_caching_extent_by_type @@ -774,40 +809,39 @@ private: extent_init_func_t &&extent_init_func ) { LOG_PREFIX(Cache::get_caching_extent_by_type); + const auto t_src = t.get_src(); CachedExtentRef ret; auto status = t.get_extent(offset, &ret); if (status == Transaction::get_extent_ret::RETIRED) { - SUBERRORT(seastore_cache, "{} {}~{} {} is retired on t -- {}", + SUBERRORT(seastore_cache, "{} {}~0x{:x} {} is retired on t -- {}", t, type, offset, length, laddr, *ret); ceph_abort("impossible"); } else if (status == Transaction::get_extent_ret::PRESENT) { + assert(ret->get_length() == length); if (ret->is_fully_loaded()) { - SUBTRACET(seastore_cache, "{} {}~{} {} is present on t -- {}", + SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is present on t -- {}", t, type, offset, length, laddr, *ret); return ret->wait_io().then([ret] { return seastar::make_ready_future<CachedExtentRef>(ret); }); } else { - assert(!ret->is_mutable()); - SUBDEBUGT(seastore_cache, "{} {}~{} {} is present on t without been \ - fully loaded, reading ...", t, type, offset, length, laddr); - auto bp = alloc_cache_buf(ret->get_length()); - ret->set_bptr(std::move(bp)); - return read_extent<CachedExtent>( - std::move(ret)); + SUBDEBUGT(seastore_cache, + "{} {}~0x{:x} {} is present on t without fully loaded, reading ... 
-- {}", + t, type, offset, length, laddr, *ret); + return do_read_extent_maybe_partial<CachedExtent>( + std::move(ret), 0, length, &t_src); } } else { - SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...", + SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is absent on t, query cache ...", t, type, offset, length, laddr); - auto f = [&t, this](CachedExtent &ext) { + auto f = [&t, this, t_src](CachedExtent &ext) { t.add_to_read_set(CachedExtentRef(&ext)); - const auto t_src = t.get_src(); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent_by_type( type, offset, laddr, length, - std::move(extent_init_func), std::move(f)) + std::move(extent_init_func), std::move(f), &t_src) ); } } @@ -831,12 +865,12 @@ private: } #endif - SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...", + SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is absent on t, query cache ...", t, type, offset, length, laddr); - auto f = [&t, this](CachedExtent &ext) { + const auto t_src = t.get_src(); + auto f = [&t, this, t_src](CachedExtent &ext) { // FIXME: assert(ext.is_stable_clean()); assert(ext.is_stable()); - const auto t_src = t.get_src(); extent_access_stats_t& access_stats = get_by_ext( get_by_src(stats.access_by_src_ext, t_src), ext.get_type()); @@ -844,12 +878,12 @@ private: ++stats.access.s.load_absent; t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext, &t_src); + touch_extent(ext, &t_src, t.get_cache_hint()); }; return trans_intr::make_interruptible( do_get_caching_extent_by_type( type, offset, laddr, length, - std::move(extent_init_func), std::move(f)) + std::move(extent_init_func), std::move(f), &t_src) ); } @@ -871,7 +905,7 @@ private: for (auto it = start_iter; it != end_iter; it++) { - res.emplace(it->paddr, it->laddr, it->len, it->type, it->seq); + res.emplace(it->paddr, it->laddr, it->len, it->type); } return res; } @@ -970,7 +1004,7 @@ public: #endif ) { LOG_PREFIX(Cache::alloc_new_non_data_extent); - SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}", + SUBTRACET(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}", t, T::TYPE, length, hint, rewrite_gen_printer_t{gen}); #ifdef UNIT_TESTS_BUILT auto result = epm.alloc_new_non_data_extent(t, T::TYPE, length, hint, gen, epaddr); @@ -978,7 +1012,8 @@ public: auto result = epm.alloc_new_non_data_extent(t, T::TYPE, length, hint, gen); #endif if (!result) { - return nullptr; + SUBERRORT(seastore_cache, "insufficient space", t); + std::rethrow_exception(crimson::ct_error::enospc::exception_ptr()); } auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result->bp)); ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING, @@ -988,7 +1023,7 @@ public: t.get_trans_id()); t.add_fresh_extent(ret); SUBDEBUGT(seastore_cache, - "allocated {} {}B extent at {}, hint={}, gen={} -- {}", + "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}", t, T::TYPE, length, result->paddr, hint, rewrite_gen_printer_t{result->gen}, *ret); return ret; @@ -1012,13 +1047,17 @@ public: #endif ) { LOG_PREFIX(Cache::alloc_new_data_extents); - SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}", + SUBTRACET(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}", t, T::TYPE, length, hint, rewrite_gen_printer_t{gen}); #ifdef UNIT_TESTS_BUILT auto results = epm.alloc_new_data_extents(t, T::TYPE, length, hint, gen, epaddr); #else auto results = epm.alloc_new_data_extents(t, T::TYPE, length, hint, gen); #endif + if (results.empty()) 
{ + SUBERRORT(seastore_cache, "insufficient space", t); + std::rethrow_exception(crimson::ct_error::enospc::exception_ptr()); + } std::vector<TCachedExtentRef<T>> extents; for (auto &result : results) { auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp)); @@ -1029,7 +1068,7 @@ public: t.get_trans_id()); t.add_fresh_extent(ret); SUBDEBUGT(seastore_cache, - "allocated {} {}B extent at {}, hint={}, gen={} -- {}", + "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}", t, T::TYPE, length, result.paddr, hint, rewrite_gen_printer_t{result.gen}, *ret); extents.emplace_back(std::move(ret)); @@ -1063,7 +1102,7 @@ public: // (relative/temp) paddr, so make extent directly ext = CachedExtent::make_cached_extent_ref<T>(std::move(nbp)); } else { - ext = CachedExtent::make_placeholder_cached_extent_ref<T>(remap_length); + ext = CachedExtent::make_cached_extent_ref<T>(remap_length); } ext->init(CachedExtent::extent_state_t::EXIST_CLEAN, @@ -1075,7 +1114,7 @@ public: auto extent = ext->template cast<T>(); extent->set_laddr(remap_laddr); t.add_fresh_extent(ext); - SUBTRACET(seastore_cache, "allocated {} {}B, hint={}, has ptr? {} -- {}", + SUBTRACET(seastore_cache, "allocated {} 0x{:x}B, hint={}, has ptr? {} -- {}", t, T::TYPE, remap_length, remap_laddr, original_bptr.has_value(), *extent); return extent; } @@ -1218,7 +1257,7 @@ public: { LOG_PREFIX(Cache::init_cached_extents); SUBINFOT(seastore_cache, - "start with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}", + "start with {}(0x{:x}B) extents, {} dirty, dirty_from={}, alloc_from={}", t, extents_index.size(), extents_index.get_bytes(), @@ -1261,7 +1300,7 @@ public: } ).si_then([this, FNAME, &t] { SUBINFOT(seastore_cache, - "finish with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}", + "finish with {}(0x{:x}B) extents, {} dirty, dirty_from={}, alloc_from={}", t, extents_index.size(), extents_index.get_bytes(), @@ -1435,11 +1474,10 @@ private: /// Update lru for access to ref void touch_extent( CachedExtent &ext, - const Transaction::src_t* p_src) + const Transaction::src_t* p_src, + cache_hint_t hint) { - if (p_src && - is_background_transaction(*p_src) && - is_logical_type(ext.get_type())) { + if (hint == CACHE_HINT_NOCACHE && is_logical_type(ext.get_type())) { return; } if (ext.is_stable_clean() && !ext.is_placeholder()) { @@ -1530,22 +1568,29 @@ private: assert(extent.is_stable_clean() && !extent.is_placeholder()); assert(extent.primary_ref_list_hook.is_linked()); assert(lru.size() > 0); - auto extent_length = extent.get_length(); - assert(current_size >= extent_length); + auto extent_loaded_length = extent.get_loaded_length(); + assert(current_size >= extent_loaded_length); lru.erase(lru.s_iterator_to(extent)); - current_size -= extent_length; - get_by_ext(sizes_by_ext, extent.get_type()).account_out(extent_length); - overall_io.out_sizes.account_in(extent_length); + current_size -= extent_loaded_length; + get_by_ext(sizes_by_ext, extent.get_type()).account_out(extent_loaded_length); + overall_io.out_sizes.account_in(extent_loaded_length); if (p_src) { get_by_ext( get_by_src(trans_io_by_src_ext, *p_src), extent.get_type() - ).out_sizes.account_in(extent_length); + ).out_sizes.account_in(extent_loaded_length); } intrusive_ptr_release(&extent); } + void trim_to_capacity( + const Transaction::src_t* p_src) { + while (current_size > capacity) { + do_remove_from_lru(lru.front(), p_src); + } + } + public: LRU(size_t capacity) : capacity(capacity) {} @@ -1579,31 +1624,55 @@ private: const Transaction::src_t* 
p_src) { assert(extent.is_stable_clean() && !extent.is_placeholder()); - auto extent_length = extent.get_length(); + auto extent_loaded_length = extent.get_loaded_length(); if (extent.primary_ref_list_hook.is_linked()) { // present, move to top (back) assert(lru.size() > 0); - assert(current_size >= extent_length); + assert(current_size >= extent_loaded_length); lru.erase(lru.s_iterator_to(extent)); lru.push_back(extent); } else { // absent, add to top (back) - current_size += extent_length; - get_by_ext(sizes_by_ext, extent.get_type()).account_in(extent_length); - overall_io.in_sizes.account_in(extent_length); + if (extent_loaded_length > 0) { + current_size += extent_loaded_length; + get_by_ext(sizes_by_ext, extent.get_type()).account_in(extent_loaded_length); + overall_io.in_sizes.account_in(extent_loaded_length); + if (p_src) { + get_by_ext( + get_by_src(trans_io_by_src_ext, *p_src), + extent.get_type() + ).in_sizes.account_in(extent_loaded_length); + } + } // else: the extent isn't loaded upon touch_extent()/on_cache(), + // account the io later in increase_cached_size() upon read_extent() + intrusive_ptr_add_ref(&extent); + lru.push_back(extent); + + trim_to_capacity(p_src); + } + } + + void increase_cached_size( + CachedExtent &extent, + extent_len_t increased_length, + const Transaction::src_t* p_src) { + assert(!extent.is_mutable()); + + if (extent.primary_ref_list_hook.is_linked()) { + assert(extent.is_stable_clean() && !extent.is_placeholder()); + // present, increase size + assert(lru.size() > 0); + current_size += increased_length; + get_by_ext(sizes_by_ext, extent.get_type()).account_in(increased_length); + overall_io.in_sizes.account_in(increased_length); if (p_src) { get_by_ext( get_by_src(trans_io_by_src_ext, *p_src), extent.get_type() - ).in_sizes.account_in(extent_length); + ).in_sizes.account_in(increased_length); } - intrusive_ptr_add_ref(&extent); - lru.push_back(extent); - // trim to capacity - while (current_size > capacity) { - do_remove_from_lru(lru.front(), p_src); - } + trim_to_capacity(nullptr); } } @@ -1758,18 +1827,23 @@ private: seastar::metrics::metric_group metrics; void register_metrics(); - /// alloc buffer for cached extent - bufferptr alloc_cache_buf(size_t size) { - // TODO: memory pooling etc - auto bp = ceph::bufferptr( - buffer::create_page_aligned(size)); - bp.zero(); - return bp; + void apply_backref_mset( + backref_entry_refs_t& backref_entries) { + for (auto& entry : backref_entries) { + backref_entry_mset.insert(*entry); + } } - void backref_batch_update( - std::vector<backref_entry_ref> &&, - const journal_seq_t &); + void apply_backref_byseq( + backref_entry_refs_t&& backref_entries, + const journal_seq_t& seq); + + void commit_backref_entries( + backref_entry_refs_t&& backref_entries, + const journal_seq_t& seq) { + apply_backref_mset(backref_entries); + apply_backref_byseq(std::move(backref_entries), seq); + } /// Add extent to extents handling dirty and refcounting /// @@ -1819,39 +1893,74 @@ private: /// Introspect transaction when it is being destructed void on_transaction_destruct(Transaction& t); + /// Read the extent in range offset~length, + /// must be called exclusively for an extent, + /// also see do_read_extent_maybe_partial(). + /// + /// May return an invalid extent due to transaction conflict. 
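+  /// How the read below proceeds: load_ranges() computes the missing + /// sub-ranges and allocates buffers for them, the newly loaded bytes are + /// accounted to the LRU via increase_cached_size(), the holes are then + /// filled by parallel epm.read() calls, and last_committed_crc is only + /// computed once the extent becomes fully loaded.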
template <typename T> read_extent_ret<T> read_extent( - TCachedExtentRef<T>&& extent + TCachedExtentRef<T>&& extent, + extent_len_t offset, + extent_len_t length, + const Transaction::src_t* p_src ) { + LOG_PREFIX(Cache::read_extent); assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING || - extent->state == CachedExtent::extent_state_t::EXIST_CLEAN || - extent->state == CachedExtent::extent_state_t::CLEAN); + extent->state == CachedExtent::extent_state_t::EXIST_CLEAN || + extent->state == CachedExtent::extent_state_t::CLEAN); + assert(!extent->is_range_loaded(offset, length)); + assert(is_aligned(offset, get_block_size())); + assert(is_aligned(length, get_block_size())); extent->set_io_wait(); - return epm.read( - extent->get_paddr(), - extent->get_length(), - extent->get_bptr() - ).safe_then( - [extent=std::move(extent), this]() mutable { - LOG_PREFIX(Cache::read_extent); - if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) { - extent->state = CachedExtent::extent_state_t::CLEAN; - } - ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN - || extent->state == CachedExtent::extent_state_t::CLEAN - || !extent->is_valid()); - if (extent->is_valid()) { - // crc will be checked against LBA leaf entry for logical extents, - // or check against in-extent crc for physical extents. - if (epm.get_checksum_needed(extent->get_paddr())) { - extent->last_committed_crc = extent->calc_crc32c(); - } else { - extent->last_committed_crc = CRC_NULL; - } - extent->on_clean_read(); - } + auto old_length = extent->get_loaded_length(); + load_ranges_t to_read = extent->load_ranges(offset, length); + auto new_length = extent->get_loaded_length(); + assert(new_length > old_length); + lru.increase_cached_size(*extent, new_length - old_length, p_src); + return seastar::do_with(to_read.ranges, [extent, this, FNAME](auto &read_ranges) { + return ExtentPlacementManager::read_ertr::parallel_for_each( + read_ranges, [extent, this, FNAME](auto &read_range) { + SUBDEBUG(seastore_cache, "reading extent {} 0x{:x}~0x{:x} ...", + extent->get_paddr(), read_range.offset, read_range.get_length()); + assert(is_aligned(read_range.offset, get_block_size())); + assert(is_aligned(read_range.get_length(), get_block_size())); + return epm.read( + extent->get_paddr() + read_range.offset, + read_range.get_length(), + read_range.ptr); + }); + }).safe_then( + [this, FNAME, extent=std::move(extent), offset, length]() mutable { + if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) { + extent->state = CachedExtent::extent_state_t::CLEAN; + } + ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN + || extent->state == CachedExtent::extent_state_t::CLEAN + || !extent->is_valid()); + if (extent->is_valid()) { + if (extent->is_fully_loaded()) { + // crc will be checked against LBA leaf entry for logical extents, + // or check against in-extent crc for physical extents. 
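+          // when checksum is not needed for this paddr, CRC_NULL is + // recorded instead; likewise below when the extent is only + // partially loaded and the full-extent crc cannot be computed yet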
+ if (epm.get_checksum_needed(extent->get_paddr())) { + extent->last_committed_crc = extent->calc_crc32c(); + } else { + extent->last_committed_crc = CRC_NULL; + } + // on_clean_read() may change the content, call after calc_crc32c() + extent->on_clean_read(); + SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done -- {}", + offset, length, *extent); + } else { + extent->last_committed_crc = CRC_NULL; + SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (partial) -- {}", + offset, length, *extent); + } + } else { + SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (invalidated) -- {}", + offset, length, *extent); + } extent->complete_io(); - SUBDEBUG(seastore_cache, "read extent done -- {}", *extent); return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>( std::move(extent)); }, diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index 76c18bde667..49fede1d9a8 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -7,6 +7,7 @@ #include "crimson/common/log.h" #include "crimson/os/seastore/btree/fixed_kv_node.h" +#include "crimson/os/seastore/lba_mapping.h" namespace { [[maybe_unused]] seastar::logger& logger() { @@ -38,12 +39,6 @@ void intrusive_ptr_release(CachedExtent *ptr) #endif -bool is_backref_mapped_extent_node(const CachedExtentRef &extent) { - return extent->is_logical() - || is_lba_node(extent->get_type()) - || extent->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL; -} - std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state) { switch (state) { @@ -94,15 +89,15 @@ CachedExtent* CachedExtent::get_transactional_view(transaction_id_t tid) { } std::ostream &operator<<(std::ostream &out, const parent_tracker_t &tracker) { - return out << "parent_tracker=" << (void*)&tracker - << ", parent=" << (void*)tracker.get_parent().get(); + return out << "tracker_ptr=" << (void*)&tracker + << ", parent_ptr=" << (void*)tracker.get_parent().get(); } std::ostream &ChildableCachedExtent::print_detail(std::ostream &out) const { if (parent_tracker) { - out << *parent_tracker; + out << ", parent_tracker(" << *parent_tracker << ")"; } else { - out << ", parent_tracker=" << (void*)nullptr; + out << ", parent_tracker(nullptr)"; } _print_detail(out); return out; @@ -148,6 +143,12 @@ void LogicalCachedExtent::on_replace_prior() { parent->children[off] = this; } +void LogicalCachedExtent::maybe_set_intermediate_laddr(LBAMapping &mapping) { + laddr = mapping.is_indirect() + ? 
mapping.get_intermediate_base() + : mapping.get_key(); +} + parent_tracker_t::~parent_tracker_t() { // this is parent's tracker, reset it auto &p = (FixedKVNode<laddr_t>&)*parent; @@ -156,30 +157,183 @@ parent_tracker_t::~parent_tracker_t() { } } -std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs) +bool BufferSpace::is_range_loaded(extent_len_t offset, extent_len_t length) const { - out << "LBAMapping(" << rhs.get_key() - << "~0x" << std::hex << rhs.get_length() << std::dec - << "->" << rhs.get_val(); - if (rhs.is_indirect()) { - out << ",indirect(" << rhs.get_intermediate_base() - << "~0x" << std::hex << rhs.get_intermediate_length() - << "@0x" << rhs.get_intermediate_offset() << std::dec - << ")"; + assert(length > 0); + auto i = buffer_map.upper_bound(offset); + if (i == buffer_map.begin()) { + return false; } - out << ")"; - return out; + --i; + auto& [i_offset, i_bl] = *i; + assert(offset >= i_offset); + assert(i_bl.length() > 0); + if (offset + length > i_offset + i_bl.length()) { + return false; + } else { + return true; + } +} + +ceph::bufferlist BufferSpace::get_buffer(extent_len_t offset, extent_len_t length) const +{ + assert(length > 0); + auto i = buffer_map.upper_bound(offset); + assert(i != buffer_map.begin()); + --i; + auto& [i_offset, i_bl] = *i; + assert(offset >= i_offset); + assert(i_bl.length() > 0); + assert(offset + length <= i_offset + i_bl.length()); + ceph::bufferlist res; + res.substr_of(i_bl, offset - i_offset, length); + return res; +} + +load_ranges_t BufferSpace::load_ranges(extent_len_t offset, extent_len_t length) +{ + assert(length > 0); + load_ranges_t ret; + auto next = buffer_map.upper_bound(offset); + + // must be assigned for the main-loop + map_t::iterator previous; + extent_len_t range_offset; + extent_len_t range_length; + + // returns whether to proceed main-loop or not + auto f_merge_next_check_hole = [this, &next, &range_offset, &range_length]( + ceph::bufferlist& previous_bl, + extent_len_t hole_length, + extent_len_t next_offset, + const ceph::bufferlist& next_bl) { + range_length -= hole_length; + previous_bl.append(next_bl); + if (range_length <= next_bl.length()) { + // "next" end includes or goes beyond the range + buffer_map.erase(next); + return false; + } else { + range_offset = next_offset + next_bl.length(); + range_length -= next_bl.length(); + // erase next should destruct next_bl + next = buffer_map.erase(next); + return true; + } + }; + + // returns whether to proceed main-loop or not + auto f_prepare_without_merge_previous = [ + this, offset, length, + &ret, &previous, &next, &range_length, + &f_merge_next_check_hole]() { + if (next == buffer_map.end()) { + // "next" reaches end, + // range has no "next" to merge + create_hole_insert_map(ret, offset, length, next); + return false; + } + // "next" is valid + auto& [n_offset, n_bl] = *next; + // next is from upper_bound() + assert(offset < n_offset); + extent_len_t hole_length = n_offset - offset; + if (length < hole_length) { + // "next" is beyond the range end, + // range has no "next" to merge + create_hole_insert_map(ret, offset, length, next); + return false; + } + // length >= hole_length + // insert hole as "previous" + previous = create_hole_insert_map(ret, offset, hole_length, next); + auto& p_bl = previous->second; + range_length = length; + return f_merge_next_check_hole(p_bl, hole_length, n_offset, n_bl); + }; + + /* + * prepare main-loop + */ + if (next == buffer_map.begin()) { + // "previous" is invalid + if (!f_prepare_without_merge_previous()) { + return
ret; + } + } else { + // "previous" is valid + previous = std::prev(next); + auto& [p_offset, p_bl] = *previous; + assert(offset >= p_offset); + extent_len_t p_end = p_offset + p_bl.length(); + if (offset <= p_end) { + // "previous" is adjacent or overlaps the range + range_offset = p_end; + assert(offset + length > p_end); + range_length = offset + length - p_end; + // start the main-loop (merge "previous") + } else { + // "previous" is not adjacent to the range + // range and buffer_map should not overlap + assert(offset > p_end); + if (!f_prepare_without_merge_previous()) { + return ret; + } + } + } + + /* + * main-loop: merge the range with "previous" and look at "next" + * + * "previous": the previous buffer_map entry, must be valid, must be mergable + * "next": the next buffer_map entry, maybe end, maybe mergable + * range_offset/length: the current range right after "previous" + */ + assert(std::next(previous) == next); + auto& [p_offset, p_bl] = *previous; + assert(range_offset == p_offset + p_bl.length()); + assert(range_length > 0); + while (next != buffer_map.end()) { + auto& [n_offset, n_bl] = *next; + assert(range_offset < n_offset); + extent_len_t hole_length = n_offset - range_offset; + if (range_length < hole_length) { + // "next" offset is beyond the range end + break; + } + // range_length >= hole_length + create_hole_append_bl(ret, p_bl, range_offset, hole_length); + if (!f_merge_next_check_hole(p_bl, hole_length, n_offset, n_bl)) { + return ret; + } + assert(std::next(previous) == next); + assert(range_offset == p_offset + p_bl.length()); + assert(range_length > 0); + } + // range has no "next" to merge: + // 1. "next" reaches end + // 2. "next" offset is beyond the range end + create_hole_append_bl(ret, p_bl, range_offset, range_length); + return ret; } -std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs) +ceph::bufferptr BufferSpace::to_full_ptr(extent_len_t length) { - bool first = true; - out << '['; - for (const auto &i: rhs) { - out << (first ? 
"" : ",") << *i; - first = false; + assert(length > 0); + assert(buffer_map.size() == 1); + auto it = buffer_map.begin(); + auto& [i_off, i_buf] = *it; + assert(i_off == 0); + if (!i_buf.is_contiguous()) { + // Allocate page aligned ptr, also see create_extent_ptr_*() + i_buf.rebuild(); } - return out << ']'; + assert(i_buf.get_num_buffers() == 1); + ceph::bufferptr ptr(i_buf.front()); + assert(ptr.is_page_aligned()); + assert(ptr.length() == length); + buffer_map.clear(); + return ptr; } } diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 6025725aa33..9dc60d719eb 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -6,15 +6,15 @@ #include <iostream> #include <boost/intrusive/list.hpp> +#include <boost/intrusive/set.hpp> #include <boost/intrusive_ptr.hpp> #include <boost/smart_ptr/intrusive_ref_counter.hpp> #include "seastar/core/shared_future.hh" #include "include/buffer.h" -#include "crimson/common/errorator.h" -#include "crimson/common/interruptible_future.h" #include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_interruptor.h" struct btree_lba_manager_test; struct lba_btree_test; @@ -23,7 +23,6 @@ struct cache_test_t; namespace crimson::os::seastore { -class Transaction; class CachedExtent; using CachedExtentRef = boost::intrusive_ptr<CachedExtent>; class SegmentedAllocator; @@ -41,6 +40,20 @@ void intrusive_ptr_release(CachedExtent *); #endif +// Note: BufferSpace::to_full_ptr() also creates extent ptr. + +inline ceph::bufferptr create_extent_ptr_rand(extent_len_t len) { + assert(is_aligned(len, CEPH_PAGE_SIZE)); + assert(len > 0); + return ceph::bufferptr(buffer::create_page_aligned(len)); +} + +inline ceph::bufferptr create_extent_ptr_zero(extent_len_t len) { + auto bp = create_extent_ptr_rand(len); + bp.zero(); + return bp; +} + template <typename T> using TCachedExtentRef = boost::intrusive_ptr<T>; @@ -155,6 +168,85 @@ struct trans_spec_view_t { boost::intrusive::compare<cmp_t>>; }; +struct load_range_t { + extent_len_t offset; + ceph::bufferptr ptr; + + extent_len_t get_length() const { + return ptr.length(); + } + + extent_len_t get_end() const { + extent_len_t end = offset + ptr.length(); + assert(end > offset); + return end; + } +}; +struct load_ranges_t { + extent_len_t length = 0; + std::list<load_range_t> ranges; + + void push_back(extent_len_t offset, ceph::bufferptr ptr) { + assert(ranges.empty() || + (ranges.back().get_end() < offset)); + assert(ptr.length()); + length += ptr.length(); + ranges.push_back({offset, std::move(ptr)}); + } +}; + +/// manage small chunks of extent +class BufferSpace { + using map_t = std::map<extent_len_t, ceph::bufferlist>; +public: + BufferSpace() = default; + + /// Returns true if offset~length is fully loaded + bool is_range_loaded(extent_len_t offset, extent_len_t length) const; + + /// Returns the bufferlist of offset~length + ceph::bufferlist get_buffer(extent_len_t offset, extent_len_t length) const; + + /// Returns the ranges to load, merge the buffer_map if possible + load_ranges_t load_ranges(extent_len_t offset, extent_len_t length); + + /// Converts to ptr when fully loaded + ceph::bufferptr to_full_ptr(extent_len_t length); + +private: + // create and append the read-hole to + // load_ranges_t and bl + static void create_hole_append_bl( + load_ranges_t& ret, + ceph::bufferlist& bl, + extent_len_t hole_offset, + extent_len_t hole_length) { + ceph::bufferptr hole_ptr = 
create_extent_ptr_rand(hole_length); + bl.append(hole_ptr); + ret.push_back(hole_offset, std::move(hole_ptr)); + } + + // create and insert the read-hole into buffer_map, + // and append to load_ranges_t + // returns the iterator containing the inserted read-hole + auto create_hole_insert_map( + load_ranges_t& ret, + extent_len_t hole_offset, + extent_len_t hole_length, + const map_t::const_iterator& next_it) { + assert(!buffer_map.contains(hole_offset)); + ceph::bufferlist bl; + create_hole_append_bl(ret, bl, hole_offset, hole_length); + auto it = buffer_map.insert( + next_it, std::pair{hole_offset, std::move(bl)}); + assert(next_it == std::next(it)); + return it; + } + + /// extent offset -> buffer; entries don't overlap and aren't contiguous + map_t buffer_map; +}; + class ExtentIndex; class CachedExtent : public boost::intrusive_ref_counter< @@ -256,6 +348,17 @@ public: virtual void on_initial_write() {} /** + * on_fully_loaded + * + * Called when ptr is ready. Normally this should be used to initialize + * the extent to be identical to CachedExtent(ptr). + * + * Note this doesn't mean the content is fully read; use on_clean_read for + * that purpose. + */ + virtual void on_fully_loaded() {} + + /** * on_clean_read * * Called after read of initially written extent. @@ -350,12 +453,12 @@ public: << ", modify_time=" << sea_time_point_printer_t{modify_time} << ", paddr=" << get_paddr() << ", prior_paddr=" << prior_poffset_str - << std::hex << ", length=0x" << get_length() << std::dec + << std::hex << ", length=0x" << get_length() + << ", loaded=0x" << get_loaded_length() << std::dec << ", state=" << state << ", last_committed_crc=" << last_committed_crc << ", refcount=" << use_count() << ", user_hint=" << user_hint - << ", fully_loaded=" << is_fully_loaded() << ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation}; if (state != extent_state_t::INVALID && state != extent_state_t::CLEAN_PENDING) { @@ -537,7 +640,40 @@ public: /// Return true if extent is fully loaded or is about to be fully loaded (call /// wait_io() in this case) bool is_fully_loaded() const { - return ptr.has_value(); + if (ptr.has_value()) { + // length == 0 iff root + assert(length == loaded_length); + assert(!buffer_space.has_value()); + return true; + } else { // ptr is std::nullopt + assert(length > loaded_length); + assert(buffer_space.has_value()); + return false; + } + } + + /// Return true if range offset~_length is loaded + bool is_range_loaded(extent_len_t offset, extent_len_t _length) { + assert(is_aligned(offset, CEPH_PAGE_SIZE)); + assert(is_aligned(_length, CEPH_PAGE_SIZE)); + assert(_length > 0); + assert(offset + _length <= length); + if (is_fully_loaded()) { + return true; + } + return buffer_space->is_range_loaded(offset, _length); + } + + /// Get the buffer covering the given offset~_length range. 
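+ /// + /// A usage sketch (editorial addition, not part of the patch; values are + /// hypothetical and assume a 0x4000-byte extent whose first two pages are + /// loaded): + ///   extent.is_range_loaded(0, 0x2000);      // true + ///   extent.get_range(0, 0x2000);            // bufferlist of 0x2000 bytes + ///   extent.is_range_loaded(0x2000, 0x2000); // false, load_ranges() first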
+ ceph::bufferlist get_range(extent_len_t offset, extent_len_t _length) { + assert(is_range_loaded(offset, _length)); + ceph::bufferlist res; + if (is_fully_loaded()) { + res.append(ceph::bufferptr(get_bptr(), offset, _length)); + } else { + res = buffer_space->get_buffer(offset, _length); + } + return res; } /** @@ -553,12 +689,9 @@ public: return length; } + /// Returns the length of loaded extent data in the cache extent_len_t get_loaded_length() const { - if (ptr.has_value()) { - return ptr->length(); - } else { - return 0; - } + return loaded_length; } /// Returns version, get_version() == 0 iff is_clean() @@ -697,12 +830,19 @@ private: */ journal_seq_t dirty_from_or_retired_at; - /// cache data contents, std::nullopt if no data in cache + /// cache data contents, std::nullopt iff partially loaded std::optional<ceph::bufferptr> ptr; - /// disk data length + /// disk data length, 0 iff root extent_len_t length; + /// loaded data length, <length iff partially loaded + extent_len_t loaded_length; + + /// manager of buffer pieces for ObjectDataBlock + /// valid iff partially loaded + std::optional<BufferSpace> buffer_space; + /// number of deltas since initial write extent_version_t version = 0; @@ -748,9 +888,29 @@ protected: trans_view_set_t retired_transactions; CachedExtent(CachedExtent &&other) = delete; - CachedExtent(ceph::bufferptr &&_ptr) : ptr(std::move(_ptr)) { - length = ptr->length(); + + /// construct a fully loaded CachedExtent + explicit CachedExtent(ceph::bufferptr &&_ptr) + : length(_ptr.length()), + loaded_length(_ptr.length()) { + ptr = std::move(_ptr); + + assert(ptr->is_page_aligned()); + assert(length > 0); + assert(is_fully_loaded()); + // must call init() to fully initialize + } + + /// construct a partially loaded CachedExtent + /// must be identical to CachedExtent(ptr) after on_fully_loaded() + explicit CachedExtent(extent_len_t _length) + : length(_length), + loaded_length(0), + buffer_space(std::in_place) { + assert(is_aligned(length, CEPH_PAGE_SIZE)); assert(length > 0); + assert(!is_fully_loaded()); + // must call init() to fully initialize } /// construct new CachedExtent, will deep copy the buffer @@ -758,16 +918,20 @@ protected: : state(other.state), dirty_from_or_retired_at(other.dirty_from_or_retired_at), length(other.get_length()), + loaded_length(other.get_loaded_length()), version(other.version), poffset(other.poffset) { - assert((length % CEPH_PAGE_SIZE) == 0); - if (other.is_fully_loaded()) { - ptr.emplace(buffer::create_page_aligned(length)); - other.ptr->copy_out(0, length, ptr->c_str()); - } else { - // the extent must be fully loaded before CoW - assert(length == 0); // in case of root - } + // the extent must be fully loaded before CoW + assert(other.is_fully_loaded()); + assert(is_aligned(length, CEPH_PAGE_SIZE)); + if (length > 0) { + ptr = create_extent_ptr_rand(length); + other.ptr->copy_out(0, length, ptr->c_str()); + } else { // length == 0, must be root + ptr = ceph::bufferptr(0); + } + + assert(is_fully_loaded()); } struct share_buffer_t {}; @@ -777,23 +941,35 @@ protected: dirty_from_or_retired_at(other.dirty_from_or_retired_at), ptr(other.ptr), length(other.get_length()), + loaded_length(other.get_loaded_length()), version(other.version), - poffset(other.poffset) {} + poffset(other.poffset) { + // the extent must be fully loaded before CoW + assert(other.is_fully_loaded()); + assert(is_aligned(length, CEPH_PAGE_SIZE)); + assert(length > 0); + assert(is_fully_loaded()); + } // 0 length is only possible for the RootBlock - struct 
zero_length_t {}; - CachedExtent(zero_length_t) : ptr(ceph::bufferptr(0)), length(0) {}; - - struct retired_placeholder_t{}; - CachedExtent(retired_placeholder_t, extent_len_t _length) - : state(extent_state_t::CLEAN), - length(_length) { - assert(length > 0); + struct root_construct_t {}; + CachedExtent(root_construct_t) + : ptr(ceph::bufferptr(0)), + length(0), + loaded_length(0) { + assert(is_fully_loaded()); + // must call init() to fully initialize } - /// no buffer extent, for lazy read - CachedExtent(extent_len_t _length) : length(_length) { - assert(length > 0); + struct retired_placeholder_construct_t {}; + CachedExtent(retired_placeholder_construct_t, extent_len_t _length) + : state(extent_state_t::CLEAN), + length(_length), + loaded_length(0), + buffer_space(std::in_place) { + assert(!is_fully_loaded()); + assert(is_aligned(length, CEPH_PAGE_SIZE)); + // must call init() to fully initialize } friend class Cache; @@ -804,9 +980,8 @@ protected: } template <typename T> - static TCachedExtentRef<T> make_placeholder_cached_extent_ref( - extent_len_t length) { - return new T(length); + static TCachedExtentRef<T> make_cached_extent_ref() { + return new T(); } void reset_prior_instance() { @@ -869,6 +1044,45 @@ protected: } } + /// Returns the ranges to load, converting to fully loaded if possible + load_ranges_t load_ranges(extent_len_t offset, extent_len_t _length) { + assert(is_aligned(offset, CEPH_PAGE_SIZE)); + assert(is_aligned(_length, CEPH_PAGE_SIZE)); + assert(_length > 0); + assert(offset + _length <= length); + assert(!is_fully_loaded()); + + if (loaded_length == 0 && _length == length) { + assert(offset == 0); + // skip rebuilding the buffer from buffer_space + ptr = create_extent_ptr_rand(length); + loaded_length = _length; + buffer_space.reset(); + assert(is_fully_loaded()); + on_fully_loaded(); + load_ranges_t ret; + ret.push_back(offset, *ptr); + return ret; + } + + load_ranges_t ret = buffer_space->load_ranges(offset, _length); + loaded_length += ret.length; + assert(length >= loaded_length); + if (length == loaded_length) { + // convert to fully loaded + ptr = buffer_space->to_full_ptr(length); + buffer_space.reset(); + assert(is_fully_loaded()); + on_fully_loaded(); + // adjust ret since the ptr has been rebuilt + for (load_range_t& range : ret.ranges) { + auto range_length = range.ptr.length(); + range.ptr = ceph::bufferptr(*ptr, range.offset, range_length); + } + } + return ret; + } + friend class crimson::os::seastore::SegmentedAllocator; friend class crimson::os::seastore::TransactionManager; friend class crimson::os::seastore::ExtentPlacementManager; @@ -883,8 +1097,6 @@ protected: std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t); std::ostream &operator<<(std::ostream &, const CachedExtent&); -bool is_backref_mapped_extent_node(const CachedExtentRef &extent); - /// Compare extents by paddr struct paddr_cmp { bool operator()(paddr_t lhs, const CachedExtent &rhs) const { @@ -1067,7 +1279,6 @@ private: }; class ChildableCachedExtent; -class LogicalCachedExtent; class child_pos_t { public: @@ -1088,14 +1299,17 @@ private: uint16_t pos = std::numeric_limits<uint16_t>::max(); }; -using get_child_ertr = crimson::errorator< - crimson::ct_error::input_output_error>; +using get_child_iertr = trans_iertr<crimson::errorator< + crimson::ct_error::input_output_error>>; +template <typename T> +using get_child_ifut = get_child_iertr::future<TCachedExtentRef<T>>; + template <typename T> struct get_child_ret_t { - std::variant<child_pos_t, 
get_child_ertr::future<TCachedExtentRef<T>>> ret; + std::variant<child_pos_t, get_child_ifut<T>> ret; get_child_ret_t(child_pos_t pos) : ret(std::move(pos)) {} - get_child_ret_t(get_child_ertr::future<TCachedExtentRef<T>> child) + get_child_ret_t(get_child_ifut<T> child) : ret(std::move(child)) {} bool has_child() const { @@ -1107,7 +1321,7 @@ struct get_child_ret_t { return std::get<0>(ret); } - get_child_ertr::future<TCachedExtentRef<T>> &get_child_fut() { + get_child_ifut<T> &get_child_fut() { ceph_assert(ret.index() == 1); return std::get<1>(ret); } @@ -1122,48 +1336,18 @@ using PhysicalNodeMappingRef = std::unique_ptr<PhysicalNodeMapping<key_t, val_t> template <typename key_t, typename val_t> class PhysicalNodeMapping { public: + PhysicalNodeMapping() = default; + PhysicalNodeMapping(const PhysicalNodeMapping&) = delete; virtual extent_len_t get_length() const = 0; - virtual extent_types_t get_type() const = 0; virtual val_t get_val() const = 0; virtual key_t get_key() const = 0; - virtual PhysicalNodeMappingRef<key_t, val_t> duplicate() const = 0; - virtual PhysicalNodeMappingRef<key_t, val_t> refresh_with_pending_parent() { - ceph_abort("impossible"); - return {}; - } virtual bool has_been_invalidated() const = 0; virtual CachedExtentRef get_parent() const = 0; virtual uint16_t get_pos() const = 0; - // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h - virtual bool is_indirect() const { return false; } - virtual key_t get_intermediate_key() const { return min_max_t<key_t>::null; } - virtual key_t get_intermediate_base() const { return min_max_t<key_t>::null; } - virtual extent_len_t get_intermediate_length() const { return 0; } virtual uint32_t get_checksum() const { ceph_abort("impossible"); return 0; } - // The start offset of the pin, must be 0 if the pin is not indirect - virtual extent_len_t get_intermediate_offset() const { - return std::numeric_limits<extent_len_t>::max(); - } - - virtual get_child_ret_t<LogicalCachedExtent> - get_logical_extent(Transaction &t) = 0; - - void link_child(ChildableCachedExtent *c) { - ceph_assert(child_pos); - child_pos->link_child(c); - } - - // For reserved mappings, the return values are - // undefined although it won't crash - virtual bool is_stable() const = 0; - virtual bool is_data_stable() const = 0; - virtual bool is_clone() const = 0; - bool is_zero_reserved() const { - return !get_val().is_real(); - } virtual bool is_parent_viewable() const = 0; virtual bool is_parent_valid() const = 0; virtual bool parent_modified() const { @@ -1176,24 +1360,8 @@ public: } virtual ~PhysicalNodeMapping() {} -protected: - std::optional<child_pos_t> child_pos = std::nullopt; }; -using LBAMapping = PhysicalNodeMapping<laddr_t, paddr_t>; -using LBAMappingRef = PhysicalNodeMappingRef<laddr_t, paddr_t>; - -std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs); - -using lba_pin_list_t = std::list<LBAMappingRef>; - -std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs); - -using BackrefMapping = PhysicalNodeMapping<paddr_t, laddr_t>; -using BackrefMappingRef = PhysicalNodeMappingRef<paddr_t, laddr_t>; - -using backref_pin_list_t = std::list<BackrefMappingRef>; - /** * RetiredExtentPlaceholder * @@ -1209,7 +1377,7 @@ class RetiredExtentPlaceholder : public CachedExtent { public: RetiredExtentPlaceholder(extent_len_t length) - : CachedExtent(CachedExtent::retired_placeholder_t{}, length) {} + : CachedExtent(CachedExtent::retired_placeholder_construct_t{}, length) {} CachedExtentRef 
duplicate_for_write(Transaction&) final { ceph_assert(0 == "Should never happen for a placeholder"); @@ -1307,6 +1475,8 @@ private: return out; } }; + +class LBAMapping; /** * LogicalCachedExtent * @@ -1341,11 +1511,7 @@ public: laddr = nladdr; } - void maybe_set_intermediate_laddr(LBAMapping &mapping) { - laddr = mapping.is_indirect() - ? mapping.get_intermediate_base() - : mapping.get_key(); - } + void maybe_set_intermediate_laddr(LBAMapping &mapping); void apply_delta_and_adjust_crc( paddr_t base, const ceph::bufferlist &bl) final { @@ -1445,8 +1611,6 @@ using lextent_list_t = addr_extent_list_base_t< } #if FMT_VERSION >= 90000 -template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {}; template <> struct fmt::formatter<crimson::os::seastore::CachedExtent> : fmt::ostream_formatter {}; template <> struct fmt::formatter<crimson::os::seastore::LogicalCachedExtent> : fmt::ostream_formatter {}; -template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {}; #endif diff --git a/src/crimson/os/seastore/collection_manager/collection_flat_node.h b/src/crimson/os/seastore/collection_manager/collection_flat_node.h index aa1e7135613..1f4de652bba 100644 --- a/src/crimson/os/seastore/collection_manager/collection_flat_node.h +++ b/src/crimson/os/seastore/collection_manager/collection_flat_node.h @@ -96,6 +96,8 @@ struct CollectionNode explicit CollectionNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {} + explicit CollectionNode(extent_len_t length) + : LogicalCachedExtent(length) {} explicit CollectionNode(const CollectionNode &other) : LogicalCachedExtent(other), decoded(other.decoded) {} diff --git a/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc index c32dc66619a..866b5bf350c 100644 --- a/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc +++ b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc @@ -51,8 +51,11 @@ FlatCollectionManager::get_coll_root(const coll_root_t &coll_root, Transaction & cc.t, coll_root.get_location(), coll_root.get_size() - ).si_then([](auto&& e) { - return get_root_iertr::make_ready_future<CollectionNodeRef>(std::move(e)); + ).si_then([](auto maybe_indirect_extent) { + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); + return get_root_iertr::make_ready_future<CollectionNodeRef>( + std::move(maybe_indirect_extent.extent)); }); } diff --git a/src/crimson/os/seastore/device.cc b/src/crimson/os/seastore/device.cc index c3bda82a7f6..cc83eb54826 100644 --- a/src/crimson/os/seastore/device.cc +++ b/src/crimson/os/seastore/device.cc @@ -12,7 +12,7 @@ namespace crimson::os::seastore { std::ostream& operator<<(std::ostream& out, const device_spec_t& ds) { return out << "device_spec(" - << "magic=" << ds.magic + << "magic=0x" << std::hex << ds.magic << std::dec << ", dtype=" << ds.dtype << ", " << device_id_printer_t{ds.id} << ")"; diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc index 0458fbfed74..fd19eeb7e58 100644 --- a/src/crimson/os/seastore/extent_placement_manager.cc +++ b/src/crimson/os/seastore/extent_placement_manager.cc @@ -1069,8 +1069,8 @@ RandomBlockOolWriter::do_write( w_info.bp = bp; writes.push_back(w_info); } - TRACE("current extent: base off {} len {},\ - maybe-merged current extent: base off {} len {}", + TRACE("current extent: 
{}~0x{:x},\ + maybe-merged current extent: {}~0x{:x}", paddr, ex->get_length(), writes.back().offset, writes.back().bp.length()); } diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h index c4e98a5f4a1..4ff9729c5f4 100644 --- a/src/crimson/os/seastore/extent_placement_manager.h +++ b/src/crimson/os/seastore/extent_placement_manager.h @@ -236,9 +236,9 @@ struct io_usage_t { cleaner_usage_t cleaner_usage; friend std::ostream &operator<<(std::ostream &out, const io_usage_t &usage) { return out << "io_usage_t(" - << "inline_usage=" << usage.inline_usage - << ", main_cleaner_usage=" << usage.cleaner_usage.main_usage - << ", cold_cleaner_usage=" << usage.cleaner_usage.cold_ool_usage + << "inline_usage=0x" << std::hex << usage.inline_usage + << ", main_cleaner_usage=0x" << usage.cleaner_usage.main_usage + << ", cold_cleaner_usage=0x" << usage.cleaner_usage.cold_ool_usage << std::dec << ")"; } }; @@ -371,9 +371,7 @@ public: // XXX: bp might be extended to point to different memory (e.g. PMem) // according to the allocator. - auto bp = ceph::bufferptr( - buffer::create_page_aligned(length)); - bp.zero(); + auto bp = create_extent_ptr_zero(length); return alloc_result_t{addr, std::move(bp), gen}; } @@ -405,9 +403,7 @@ public: #ifdef UNIT_TESTS_BUILT if (unlikely(external_paddr.has_value())) { assert(external_paddr->is_fake()); - auto bp = ceph::bufferptr( - buffer::create_page_aligned(length)); - bp.zero(); + auto bp = create_extent_ptr_zero(length); allocs.emplace_back(alloc_result_t{*external_paddr, std::move(bp), gen}); } else { #else @@ -418,15 +414,17 @@ public: for (auto &ext : addrs) { auto left = ext.len; while (left > 0) { - auto len = std::min(max_data_allocation_size, left); - auto bp = ceph::bufferptr(buffer::create_page_aligned(len)); - bp.zero(); + auto len = left; + if (max_data_allocation_size) { + len = std::min(max_data_allocation_size, len); + } + auto bp = create_extent_ptr_zero(len); auto start = ext.start.is_delayed() ? 
ext.start : ext.start + (ext.len - left); allocs.emplace_back(alloc_result_t{start, std::move(bp), gen}); SUBDEBUGT(seastore_epm, - "allocated {} {}B extent at {}, hint={}, gen={}", + "allocated {} 0x{:x}B extent at {}, hint={}, gen={}", t, type, len, start, hint, gen); left -= len; } diff --git a/src/crimson/os/seastore/extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager.cc deleted file mode 100644 index b0dc1b8c8a8..00000000000 --- a/src/crimson/os/seastore/extentmap_manager.cc +++ /dev/null @@ -1,33 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#include <experimental/iterator> -#include <iostream> - -#include "crimson/os/seastore/transaction_manager.h" -#include "crimson/os/seastore/extentmap_manager.h" -#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h" -namespace crimson::os::seastore::extentmap_manager { - -ExtentMapManagerRef create_extentmap_manager( - TransactionManager &trans_manager) { - return ExtentMapManagerRef(new BtreeExtentMapManager(trans_manager)); -} - -} - -namespace crimson::os::seastore { - -std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs) -{ - return out << "extent_mapping_t (" << rhs.logical_offset << "~" << rhs.length - << "->" << rhs.laddr << ")"; -} - -std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs) -{ - out << '['; - std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", ")); - return out << ']'; -} - -} diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h index a5c9029c43c..298935bd22e 100644 --- a/src/crimson/os/seastore/journal.h +++ b/src/crimson/os/seastore/journal.h @@ -59,13 +59,13 @@ public: crimson::ct_error::erange, crimson::ct_error::input_output_error >; - using submit_record_ret = submit_record_ertr::future< - record_locator_t - >; - virtual submit_record_ret submit_record( + using on_submission_func_t = std::function< + void(record_locator_t)>; + virtual submit_record_ertr::future<> submit_record( record_t &&record, - OrderingHandle &handle - ) = 0; + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission) = 0; /** * flush @@ -101,9 +101,6 @@ public: virtual replay_ret replay( delta_handler_t &&delta_handler) = 0; - virtual seastar::future<> finish_commit( - transaction_type_t type) = 0; - virtual ~Journal() {} virtual backend_type_t get_type() = 0; diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.cc b/src/crimson/os/seastore/journal/circular_bounded_journal.cc index 9ee8b1b997f..41ff8318aba 100644 --- a/src/crimson/os/seastore/journal/circular_bounded_journal.cc +++ b/src/crimson/os/seastore/journal/circular_bounded_journal.cc @@ -58,35 +58,52 @@ CircularBoundedJournal::close_ertr::future<> CircularBoundedJournal::close() return record_submitter.close(); } -CircularBoundedJournal::submit_record_ret +CircularBoundedJournal::submit_record_ertr::future<> CircularBoundedJournal::submit_record( record_t &&record, - OrderingHandle &handle) + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission) { LOG_PREFIX(CircularBoundedJournal::submit_record); DEBUG("H{} {} start ...", (void*)&handle, record); assert(write_pipeline); - return do_submit_record(std::move(record), handle); + return do_submit_record( + std::move(record), handle, std::move(on_submission) + ).safe_then([this, t_src] { + if (is_trim_transaction(t_src)) { + return 
update_journal_tail( + trimmer.get_dirty_tail(), + trimmer.get_alloc_tail()); + } else { + return seastar::now(); + } + }); } -CircularBoundedJournal::submit_record_ret +CircularBoundedJournal::submit_record_ertr::future<> CircularBoundedJournal::do_submit_record( record_t &&record, - OrderingHandle &handle) + OrderingHandle &handle, + on_submission_func_t &&on_submission) { LOG_PREFIX(CircularBoundedJournal::do_submit_record); if (!record_submitter.is_available()) { DEBUG("H{} wait ...", (void*)&handle); return record_submitter.wait_available( - ).safe_then([this, record=std::move(record), &handle]() mutable { - return do_submit_record(std::move(record), handle); + ).safe_then([this, record=std::move(record), &handle, + on_submission=std::move(on_submission)]() mutable { + return do_submit_record( + std::move(record), handle, std::move(on_submission)); }); } auto action = record_submitter.check_action(record.size); if (action == RecordSubmitter::action_t::ROLL) { return record_submitter.roll_segment( - ).safe_then([this, record=std::move(record), &handle]() mutable { - return do_submit_record(std::move(record), handle); + ).safe_then([this, record=std::move(record), &handle, + on_submission=std::move(on_submission)]() mutable { + return do_submit_record( + std::move(record), handle, std::move(on_submission)); }); } @@ -99,13 +116,16 @@ CircularBoundedJournal::do_submit_record( return handle.enter(write_pipeline->device_submission ).then([submit_fut=std::move(submit_ret.future)]() mutable { return std::move(submit_fut); - }).safe_then([FNAME, this, &handle](record_locator_t result) { + }).safe_then([FNAME, this, &handle, on_submission=std::move(on_submission) + ](record_locator_t result) mutable { return handle.enter(write_pipeline->finalize - ).then([FNAME, this, result, &handle] { + ).then([FNAME, this, result, &handle, + on_submission=std::move(on_submission)] { DEBUG("H{} finish with {}", (void*)&handle, result); auto new_committed_to = result.write_result.get_end_seq(); record_submitter.update_committed_to(new_committed_to); - return result; + std::invoke(on_submission, result); + return seastar::now(); }); }); } @@ -392,13 +412,4 @@ Journal::replay_ret CircularBoundedJournal::replay( }); } -seastar::future<> CircularBoundedJournal::finish_commit(transaction_type_t type) { - if (is_trim_transaction(type)) { - return update_journal_tail( - trimmer.get_dirty_tail(), - trimmer.get_alloc_tail()); - } - return seastar::now(); -} - } diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.h b/src/crimson/os/seastore/journal/circular_bounded_journal.h index 874bd8dc086..16278df6cfe 100644 --- a/src/crimson/os/seastore/journal/circular_bounded_journal.h +++ b/src/crimson/os/seastore/journal/circular_bounded_journal.h @@ -80,9 +80,11 @@ public: return backend_type_t::RANDOM_BLOCK; } - submit_record_ret submit_record( + submit_record_ertr::future<> submit_record( record_t &&record, - OrderingHandle &handle + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission ) final; seastar::future<> flush( @@ -148,8 +150,6 @@ public: return cjs.get_records_start(); } - seastar::future<> finish_commit(transaction_type_t type) final; - using cbj_delta_handler_t = std::function< replay_ertr::future<bool>( const record_locator_t&, @@ -160,7 +160,10 @@ public: cbj_delta_handler_t &&delta_handler, journal_seq_t tail); - submit_record_ret do_submit_record(record_t &&record, OrderingHandle &handle); + submit_record_ertr::future<> do_submit_record( + record_t 
&&record, + OrderingHandle &handle, + on_submission_func_t &&on_submission); void try_read_rolled_header(scan_valid_records_cursor &cursor) { paddr_t addr = convert_abs_addr_to_paddr( diff --git a/src/crimson/os/seastore/journal/record_submitter.cc b/src/crimson/os/seastore/journal/record_submitter.cc index adf8251b8a7..4976eee96e7 100644 --- a/src/crimson/os/seastore/journal/record_submitter.cc +++ b/src/crimson/os/seastore/journal/record_submitter.cc @@ -24,7 +24,7 @@ RecordBatch::add_pending( LOG_PREFIX(RecordBatch::add_pending); auto new_size = get_encoded_length_after(record, block_size); auto dlength_offset = pending.size.dlength; - TRACE("{} batches={}, write_size={}, dlength_offset={} ...", + TRACE("{} batches={}, write_size=0x{:x}, dlength_offset=0x{:x} ...", name, pending.get_size() + 1, new_size.get_encoded_length(), @@ -144,7 +144,7 @@ RecordSubmitter::RecordSubmitter( batches(new RecordBatch[io_depth + 1]) { LOG_PREFIX(RecordSubmitter); - INFO("{} io_depth_limit={}, batch_capacity={}, batch_flush_size={}, " + INFO("{} io_depth_limit={}, batch_capacity={}, batch_flush_size=0x{:x}, " "preferred_fullness={}", get_name(), io_depth, batch_capacity, batch_flush_size, preferred_fullness); diff --git a/src/crimson/os/seastore/journal/segment_allocator.cc b/src/crimson/os/seastore/journal/segment_allocator.cc index 11f3cc8fd31..5405662b91e 100644 --- a/src/crimson/os/seastore/journal/segment_allocator.cc +++ b/src/crimson/os/seastore/journal/segment_allocator.cc @@ -189,7 +189,7 @@ SegmentAllocator::write(ceph::bufferlist&& to_write) auto write_length = to_write.length(); auto write_start_offset = written_to; if (unlikely(LOCAL_LOGGER.is_enabled(seastar::log_level::trace))) { - TRACE("{} {}~{}", print_name, get_written_to(), write_length); + TRACE("{} {}~0x{:x}", print_name, get_written_to(), write_length); } assert(write_length > 0); assert((write_length % get_block_size()) == 0); @@ -250,7 +250,7 @@ SegmentAllocator::close_segment() close_seg_info.num_extents}; ceph::bufferlist bl; encode(tail, bl); - INFO("{} close segment {}, written_to={}", + INFO("{} close segment {}, written_to=0x{:x}", print_name, tail, written_to); diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc index eca45f113c2..67c0b3fb8ac 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.cc +++ b/src/crimson/os/seastore/journal/segmented_journal.cc @@ -368,25 +368,30 @@ seastar::future<> SegmentedJournal::flush(OrderingHandle &handle) }); } -SegmentedJournal::submit_record_ret +SegmentedJournal::submit_record_ertr::future<> SegmentedJournal::do_submit_record( record_t &&record, - OrderingHandle &handle) + OrderingHandle &handle, + on_submission_func_t &&on_submission) { LOG_PREFIX(SegmentedJournal::do_submit_record); if (!record_submitter.is_available()) { DEBUG("H{} wait ...", (void*)&handle); return record_submitter.wait_available( - ).safe_then([this, record=std::move(record), &handle]() mutable { - return do_submit_record(std::move(record), handle); + ).safe_then([this, record=std::move(record), &handle, + on_submission=std::move(on_submission)]() mutable { + return do_submit_record( + std::move(record), handle, std::move(on_submission)); }); } auto action = record_submitter.check_action(record.size); if (action == RecordSubmitter::action_t::ROLL) { DEBUG("H{} roll, unavailable ...", (void*)&handle); return record_submitter.roll_segment( - ).safe_then([this, record=std::move(record), &handle]() mutable { - return 
do_submit_record(std::move(record), handle); + ).safe_then([this, record=std::move(record), &handle, + on_submission=std::move(on_submission)]() mutable { + return do_submit_record( + std::move(record), handle, std::move(on_submission)); }); } else { // SUBMIT_FULL/NOT_FULL DEBUG("H{} submit {} ...", @@ -398,22 +403,27 @@ SegmentedJournal::do_submit_record( return handle.enter(write_pipeline->device_submission ).then([submit_fut=std::move(submit_ret.future)]() mutable { return std::move(submit_fut); - }).safe_then([FNAME, this, &handle](record_locator_t result) { + }).safe_then([FNAME, this, &handle, on_submission=std::move(on_submission) + ](record_locator_t result) mutable { return handle.enter(write_pipeline->finalize - ).then([FNAME, this, result, &handle] { + ).then([FNAME, this, result, &handle, + on_submission=std::move(on_submission)] { DEBUG("H{} finish with {}", (void*)&handle, result); auto new_committed_to = result.write_result.get_end_seq(); record_submitter.update_committed_to(new_committed_to); - return result; + std::invoke(on_submission, result); + return seastar::now(); }); }); } } -SegmentedJournal::submit_record_ret +SegmentedJournal::submit_record_ertr::future<> SegmentedJournal::submit_record( record_t &&record, - OrderingHandle &handle) + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission) { LOG_PREFIX(SegmentedJournal::submit_record); DEBUG("H{} {} start ...", (void*)&handle, record); @@ -424,12 +434,13 @@ SegmentedJournal::submit_record( ).get_encoded_length(); auto max_record_length = journal_segment_allocator.get_max_write_length(); if (expected_size > max_record_length) { - ERROR("H{} {} exceeds max record size {}", + ERROR("H{} {} exceeds max record size 0x{:x}", (void*)&handle, record, max_record_length); return crimson::ct_error::erange::make(); } - return do_submit_record(std::move(record), handle); + return do_submit_record( + std::move(record), handle, std::move(on_submission)); } } diff --git a/src/crimson/os/seastore/journal/segmented_journal.h b/src/crimson/os/seastore/journal/segmented_journal.h index 891de7ec306..3f51de70fb3 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.h +++ b/src/crimson/os/seastore/journal/segmented_journal.h @@ -44,9 +44,11 @@ public: close_ertr::future<> close() final; - submit_record_ret submit_record( + submit_record_ertr::future<> submit_record( record_t &&record, - OrderingHandle &handle) final; + OrderingHandle &handle, + transaction_type_t t_src, + on_submission_func_t &&on_submission) final; seastar::future<> flush(OrderingHandle &handle) final; @@ -59,9 +61,6 @@ public: backend_type_t get_type() final { return backend_type_t::SEGMENTED; } - seastar::future<> finish_commit(transaction_type_t type) { - return seastar::now(); - } bool is_checksum_needed() final { // segmented journal always requires checksum @@ -69,10 +68,10 @@ public: } private: - submit_record_ret do_submit_record( + submit_record_ertr::future<> do_submit_record( record_t &&record, - OrderingHandle &handle - ); + OrderingHandle &handle, + on_submission_func_t &&on_submission); SegmentSeqAllocatorRef segment_seq_allocator; SegmentAllocator journal_segment_allocator; diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index a050b2cdf47..9a34bf56157 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -19,6 +19,7 @@ #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/seastore_types.h" +#include 
"crimson/os/seastore/lba_mapping.h" namespace crimson::os::seastore { diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index b7a1d8f8ba9..888d3c359ac 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -52,28 +52,22 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node< ceph_assert(lba_root->is_initial_pending() == root_block->is_pending()); return {true, - trans_intr::make_interruptible( - c.cache.get_extent_viewable_by_trans(c.trans, lba_root))}; + c.cache.get_extent_viewable_by_trans(c.trans, lba_root)}; } else if (root_block->is_pending()) { auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance()); lba_root = prior.lba_root_node; if (lba_root) { return {true, - trans_intr::make_interruptible( - c.cache.get_extent_viewable_by_trans(c.trans, lba_root))}; + c.cache.get_extent_viewable_by_trans(c.trans, lba_root)}; } else { c.cache.account_absent_access(c.trans.get_src()); return {false, - trans_intr::make_interruptible( - Cache::get_extent_ertr::make_ready_future< - CachedExtentRef>())}; + Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()}; } } else { c.cache.account_absent_access(c.trans.get_src()); return {false, - trans_intr::make_interruptible( - Cache::get_extent_ertr::make_ready_future< - CachedExtentRef>())}; + Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()}; } } @@ -100,6 +94,45 @@ void unlink_phy_tree_root_node<laddr_t>(RootBlockRef &root_block) { namespace crimson::os::seastore::lba_manager::btree { +get_child_ret_t<LogicalCachedExtent> +BtreeLBAMapping::get_logical_extent(Transaction &t) +{ + ceph_assert(is_parent_viewable()); + assert(pos != std::numeric_limits<uint16_t>::max()); + ceph_assert(t.get_trans_id() == ctx.trans.get_trans_id()); + auto &p = static_cast<LBALeafNode&>(*parent); + auto k = this->is_indirect() + ? this->get_intermediate_base() + : get_key(); + auto v = p.template get_child<LogicalCachedExtent>(ctx, pos, k); + if (!v.has_child()) { + this->child_pos = v.get_child_pos(); + } + return v; +} + +bool BtreeLBAMapping::is_stable() const +{ + assert(!this->parent_modified()); + assert(pos != std::numeric_limits<uint16_t>::max()); + auto &p = static_cast<LBALeafNode&>(*parent); + auto k = this->is_indirect() + ? this->get_intermediate_base() + : get_key(); + return p.is_child_stable(ctx, pos, k); +} + +bool BtreeLBAMapping::is_data_stable() const +{ + assert(!this->parent_modified()); + assert(pos != std::numeric_limits<uint16_t>::max()); + auto &p = static_cast<LBALeafNode&>(*parent); + auto k = this->is_indirect() + ? 
this->get_intermediate_base() + : get_key(); + return p.is_child_data_stable(ctx, pos, k); +} + BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( Transaction &t) diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index ef10ff9623b..e0902053d0e 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -23,11 +23,15 @@ #include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" #include "crimson/os/seastore/btree/btree_range_pin.h" +namespace crimson::os::seastore { +class LogicalCachedExtent; +} + namespace crimson::os::seastore::lba_manager::btree { struct LBALeafNode; -class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> { +class BtreeLBAMapping : public LBAMapping { // To support cloning, there are two kinds of lba mappings: // 1. physical lba mapping: the pladdr in the value of which is the paddr of // the corresponding extent; @@ -61,14 +65,14 @@ class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> { // their keys. public: BtreeLBAMapping(op_context_t<laddr_t> ctx) - : BtreeNodeMapping(ctx) {} + : LBAMapping(ctx) {} BtreeLBAMapping( op_context_t<laddr_t> c, LBALeafNodeRef parent, uint16_t pos, lba_map_val_t &val, lba_node_meta_t meta) - : BtreeNodeMapping( + : LBAMapping( c, parent, pos, @@ -190,8 +194,12 @@ public: SUBDEBUGT(seastore_lba, "new pin {}", ctx.trans, static_cast<LBAMapping&>(*new_pin)); return new_pin; } + bool is_stable() const final; + bool is_data_stable() const final; + get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction &t); + protected: - std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate( + LBAMappingRef _duplicate( op_context_t<laddr_t> ctx) const final { auto pin = std::unique_ptr<BtreeLBAMapping>(new BtreeLBAMapping(ctx)); pin->key = key; diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h index ad5d336815b..524bf23dd58 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -259,7 +259,7 @@ struct LBALeafNode } // See LBAInternalNode, same concept - void resolve_relative_addrs(paddr_t base); + void resolve_relative_addrs(paddr_t base) final; void node_resolve_vals( internal_iterator_t from, internal_iterator_t to) const final diff --git a/src/crimson/os/seastore/lba_mapping.cc b/src/crimson/os/seastore/lba_mapping.cc new file mode 100644 index 00000000000..90fae09ce21 --- /dev/null +++ b/src/crimson/os/seastore/lba_mapping.cc @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "lba_mapping.h" + +namespace crimson::os::seastore { + +std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs) +{ + out << "LBAMapping(" << rhs.get_key() + << "~0x" << std::hex << rhs.get_length() << std::dec + << "->" << rhs.get_val(); + if (rhs.is_indirect()) { + out << ",indirect(" << rhs.get_intermediate_base() + << "~0x" << std::hex << rhs.get_intermediate_length() + << "@0x" << rhs.get_intermediate_offset() << std::dec + << ")"; + } + out << ")"; + return out; +} + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs) +{ + bool first = true; + out << '['; + for (const auto &i: rhs) { + out << (first ? 
"" : ",") << *i; + first = false; + } + return out << ']'; +} + +LBAMappingRef LBAMapping::duplicate() const { + auto ret = _duplicate(ctx); + ret->range = range; + ret->value = value; + ret->parent = parent; + ret->len = len; + ret->pos = pos; + return ret; +} + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/lba_mapping.h b/src/crimson/os/seastore/lba_mapping.h new file mode 100644 index 00000000000..338d4d53f55 --- /dev/null +++ b/src/crimson/os/seastore/lba_mapping.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/btree/btree_range_pin.h" + +namespace crimson::os::seastore { + +class LBAMapping; +using LBAMappingRef = std::unique_ptr<LBAMapping>; + +class LogicalCachedExtent; + +class LBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> { +public: + LBAMapping(op_context_t<laddr_t> ctx) + : BtreeNodeMapping<laddr_t, paddr_t>(ctx) {} + template <typename... T> + LBAMapping(T&&... t) + : BtreeNodeMapping<laddr_t, paddr_t>(std::forward<T>(t)...) + { + if (!parent->is_pending()) { + this->child_pos = {parent, pos}; + } + } + + // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h + virtual bool is_indirect() const = 0; + virtual laddr_t get_intermediate_key() const = 0; + virtual laddr_t get_intermediate_base() const = 0; + virtual extent_len_t get_intermediate_length() const = 0; + // The start offset of the pin, must be 0 if the pin is not indirect + virtual extent_len_t get_intermediate_offset() const = 0; + + virtual get_child_ret_t<LogicalCachedExtent> + get_logical_extent(Transaction &t) = 0; + + void link_child(ChildableCachedExtent *c) { + ceph_assert(child_pos); + child_pos->link_child(c); + } + virtual LBAMappingRef refresh_with_pending_parent() = 0; + + // For reserved mappings, the return values are + // undefined although it won't crash + virtual bool is_stable() const = 0; + virtual bool is_data_stable() const = 0; + virtual bool is_clone() const = 0; + bool is_zero_reserved() const { + return !get_val().is_real(); + } + + LBAMappingRef duplicate() const; + + virtual ~LBAMapping() {} +protected: + virtual LBAMappingRef _duplicate(op_context_t<laddr_t>) const = 0; + std::optional<child_pos_t> child_pos = std::nullopt; +}; + +std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs); +using lba_pin_list_t = std::list<LBAMappingRef>; + +std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs); + +} // namespace crimson::os::seastore + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {}; +template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index 20f86da5d3d..8f817a521cf 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -50,6 +50,8 @@ struct extent_to_write_t { extent_to_write_t(const extent_to_write_t &) = delete; extent_to_write_t(extent_to_write_t &&) = default; + extent_to_write_t& operator=(const extent_to_write_t&) = delete; + extent_to_write_t& operator=(extent_to_write_t&&) = default; bool is_data() const { return type == type_t::DATA; @@ -523,7 +525,7 @@ ObjectDataHandler::write_ret do_insertions( if (region.is_data()) { 
assert_aligned(region.len); ceph_assert(region.len == region.bl->length()); - DEBUGT("allocating extent: {}~{}", + DEBUGT("allocating extent: {}~0x{:x}", ctx.t, region.addr, region.len); @@ -554,7 +556,7 @@ ObjectDataHandler::write_ret do_insertions( ObjectDataHandler::write_iertr::pass_further{} ); } else if (region.is_zero()) { - DEBUGT("reserving: {}~{}", + DEBUGT("reserving: {}~0x{:x}", ctx.t, region.addr, region.len); @@ -696,7 +698,7 @@ public: << ", aligned_data_end=" << overwrite_plan.aligned_data_end << ", left_operation=" << overwrite_plan.left_operation << ", right_operation=" << overwrite_plan.right_operation - << ", block_size=" << overwrite_plan.block_size + << ", block_size=0x" << std::hex << overwrite_plan.block_size << std::dec << ", is_left_fresh=" << overwrite_plan.is_left_fresh << ", is_right_fresh=" << overwrite_plan.is_right_fresh << ")"; @@ -827,7 +829,7 @@ namespace crimson::os::seastore { */ using operate_ret_bare = std::pair< std::optional<extent_to_write_t>, - std::optional<bufferptr>>; + std::optional<ceph::bufferlist>>; using operate_ret = get_iertr::future<operate_ret_bare>; operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan_t &overwrite_plan) { @@ -839,19 +841,26 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan if (overwrite_plan.left_operation == overwrite_operation_t::OVERWRITE_ZERO) { assert(pin->get_val().is_zero()); + auto zero_extent_len = overwrite_plan.get_left_extent_size(); assert_aligned(zero_extent_len); + std::optional<extent_to_write_t> extent_to_write; + if (zero_extent_len != 0) { + extent_to_write = extent_to_write_t::create_zero( + overwrite_plan.pin_begin, zero_extent_len); + } + auto zero_prepend_len = overwrite_plan.get_left_alignment_size(); + std::optional<ceph::bufferlist> prepend_bl; + if (zero_prepend_len != 0) { + ceph::bufferlist zero_bl; + zero_bl.append_zero(zero_prepend_len); + prepend_bl = std::move(zero_bl); + } + return get_iertr::make_ready_future<operate_ret_bare>( - (zero_extent_len == 0 - ? std::nullopt - : std::make_optional(extent_to_write_t::create_zero( - overwrite_plan.pin_begin, zero_extent_len))), - (zero_prepend_len == 0 - ? 
std::nullopt - : std::make_optional(bufferptr( - ceph::buffer::create(zero_prepend_len, 0)))) - ); + std::move(extent_to_write), + std::move(prepend_bl)); } else if (overwrite_plan.left_operation == overwrite_operation_t::MERGE_EXISTING) { auto prepend_len = overwrite_plan.get_left_size(); if (prepend_len == 0) { @@ -859,16 +868,15 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan std::nullopt, std::nullopt); } else { - extent_len_t off = pin->get_intermediate_offset(); return ctx.tm.read_pin<ObjectDataBlock>( ctx.t, pin->duplicate() - ).si_then([prepend_len, off](auto left_extent) { + ).si_then([prepend_len](auto maybe_indirect_left_extent) { + auto read_bl = maybe_indirect_left_extent.get_bl(); + ceph::bufferlist prepend_bl; + prepend_bl.substr_of(read_bl, 0, prepend_len); return get_iertr::make_ready_future<operate_ret_bare>( std::nullopt, - std::make_optional(bufferptr( - left_extent->get_bptr(), - off, - prepend_len))); + std::move(prepend_bl)); }); } } else { @@ -888,18 +896,17 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan std::move(left_to_write_extent), std::nullopt); } else { - extent_len_t off = pin->get_intermediate_offset(); return ctx.tm.read_pin<ObjectDataBlock>( ctx.t, pin->duplicate() - ).si_then([prepend_offset=extent_len + off, prepend_len, + ).si_then([prepend_offset=extent_len, prepend_len, left_to_write_extent=std::move(left_to_write_extent)] - (auto left_extent) mutable { + (auto left_maybe_indirect_extent) mutable { + auto read_bl = left_maybe_indirect_extent.get_bl(); + ceph::bufferlist prepend_bl; + prepend_bl.substr_of(read_bl, prepend_offset, prepend_len); return get_iertr::make_ready_future<operate_ret_bare>( std::move(left_to_write_extent), - std::make_optional(bufferptr( - left_extent->get_bptr(), - prepend_offset, - prepend_len))); + std::move(prepend_bl)); }); } } @@ -922,19 +929,26 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla assert(overwrite_plan.data_end >= right_pin_begin); if (overwrite_plan.right_operation == overwrite_operation_t::OVERWRITE_ZERO) { assert(pin->get_val().is_zero()); + auto zero_suffix_len = overwrite_plan.get_right_alignment_size(); + std::optional<ceph::bufferlist> suffix_bl; + if (zero_suffix_len != 0) { + ceph::bufferlist zero_bl; + zero_bl.append_zero(zero_suffix_len); + suffix_bl = std::move(zero_bl); + } + auto zero_extent_len = overwrite_plan.get_right_extent_size(); assert_aligned(zero_extent_len); + std::optional<extent_to_write_t> extent_to_write; + if (zero_extent_len != 0) { + extent_to_write = extent_to_write_t::create_zero( + overwrite_plan.aligned_data_end, zero_extent_len); + } + return get_iertr::make_ready_future<operate_ret_bare>( - (zero_extent_len == 0 - ? std::nullopt - : std::make_optional(extent_to_write_t::create_zero( - overwrite_plan.aligned_data_end, zero_extent_len))), - (zero_suffix_len == 0 - ? 
std::nullopt - : std::make_optional(bufferptr( - ceph::buffer::create(zero_suffix_len, 0)))) - ); + std::move(extent_to_write), + std::move(suffix_bl)); } else if (overwrite_plan.right_operation == overwrite_operation_t::MERGE_EXISTING) { auto append_len = overwrite_plan.get_right_size(); if (append_len == 0) { return get_iertr::make_ready_future<operate_ret_bare>( std::nullopt, std::nullopt); } else { auto append_offset = overwrite_plan.data_end.get_byte_distance< - extent_len_t>(right_pin_begin) - + pin->get_intermediate_offset(); + extent_len_t>(right_pin_begin); return ctx.tm.read_pin<ObjectDataBlock>( ctx.t, pin->duplicate() - ).si_then([append_offset, append_len](auto right_extent) { + ).si_then([append_offset, append_len] + (auto right_maybe_indirect_extent) { + auto read_bl = right_maybe_indirect_extent.get_bl(); + ceph::bufferlist suffix_bl; + suffix_bl.substr_of(read_bl, append_offset, append_len); return get_iertr::make_ready_future<operate_ret_bare>( std::nullopt, - std::make_optional(bufferptr( - right_extent->get_bptr(), - append_offset, - append_len))); + std::move(suffix_bl)); }); } } else { @@ -976,19 +990,18 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla } else { auto append_offset = overwrite_plan.data_end.get_byte_distance< - extent_len_t>(right_pin_begin) - + pin->get_intermediate_offset(); + extent_len_t>(right_pin_begin); return ctx.tm.read_pin<ObjectDataBlock>( ctx.t, pin->duplicate() ).si_then([append_offset, append_len, right_to_write_extent=std::move(right_to_write_extent)] - (auto right_extent) mutable { + (auto maybe_indirect_right_extent) mutable { + auto read_bl = maybe_indirect_right_extent.get_bl(); + ceph::bufferlist suffix_bl; + suffix_bl.substr_of(read_bl, append_offset, append_len); return get_iertr::make_ready_future<operate_ret_bare>( std::move(right_to_write_extent), - std::make_optional(bufferptr( - right_extent->get_bptr(), - append_offset, - append_len))); + std::move(suffix_bl)); }); } } @@ -1046,13 +1059,13 @@ ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation( ceph_assert(size <= max_object_size); if (!object_data.is_null()) { ceph_assert(object_data.get_reserved_data_len() == max_object_size); - DEBUGT("reservation present: {}~{}", + DEBUGT("reservation present: {}~0x{:x}", ctx.t, object_data.get_reserved_data_base(), object_data.get_reserved_data_len()); return write_iertr::now(); } else { - DEBUGT("reserving: {}~{}", + DEBUGT("reserving: {}~0x{:x}", ctx.t, ctx.onode.get_data_hint(), max_object_size); @@ -1085,7 +1098,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation( LOG_PREFIX(ObjectDataHandler::trim_data_reservation); auto data_base = object_data.get_reserved_data_base(); auto data_len = object_data.get_reserved_data_len(); - DEBUGT("object_data: {}~{}", ctx.t, data_base, data_len); + DEBUGT("object_data: {}~0x{:x}", ctx.t, data_base, data_len); laddr_t aligned_start = (data_base + size).get_aligned_laddr(); loffset_t aligned_length = data_len - aligned_start.get_byte_distance<loffset_t>(data_base); @@ -1121,7 +1134,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation( if (append_len == 0) { LOG_PREFIX(ObjectDataHandler::trim_data_reservation); TRACET("First pin overlaps the boundary and has aligned data" ", create existing at addr:{}, len:0x{:x}", ctx.t, pin.get_key(), size - pin_offset); to_write.push_back(extent_to_write_t::create_existing( pin.duplicate(), pin.get_key(), @@ -1136,22 
+1149,18 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation( ctx.t, pin.duplicate() ).si_then([ctx, size, pin_offset, append_len, roundup_size, - &pin, &object_data, &to_write](auto extent) { - bufferlist bl; - bl.append( - bufferptr( - extent->get_bptr(), - pin.get_intermediate_offset(), - size - pin_offset - )); - bl.append_zero(append_len); + &pin, &object_data, &to_write](auto maybe_indirect_extent) { + auto read_bl = maybe_indirect_extent.get_bl(); + ceph::bufferlist write_bl; + write_bl.substr_of(read_bl, 0, size - pin_offset); + write_bl.append_zero(append_len); LOG_PREFIX(ObjectDataHandler::trim_data_reservation); TRACET("First pin overlaps the boundary and has unaligned data" - "create data at addr:{}, len:{}", - ctx.t, pin.get_key(), bl.length()); + ", create data at addr:{}, len:0x{:x}", + ctx.t, pin.get_key(), write_bl.length()); to_write.push_back(extent_to_write_t::create_data( pin.get_key(), - bl)); + write_bl)); to_write.push_back(extent_to_write_t::create_zero( (object_data.get_reserved_data_base() + roundup_size).checked_to_laddr(), object_data.get_reserved_data_len() - roundup_size)); @@ -1184,44 +1193,45 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation( * get_to_writes_with_zero_buffer * * Returns extent_to_write_t's reflecting a zero region extending - * from offset~len with headptr optionally on the left and tailptr + * from offset~len with headbl optionally on the left and tailbl * optionally on the right. */ extent_to_write_list_t get_to_writes_with_zero_buffer( laddr_t data_base, const extent_len_t block_size, objaddr_t offset, extent_len_t len, - std::optional<bufferptr> &&headptr, std::optional<bufferptr> &&tailptr) + std::optional<ceph::bufferlist> &&headbl, + std::optional<ceph::bufferlist> &&tailbl) { auto zero_left = p2roundup(offset, (objaddr_t)block_size); auto zero_right = p2align(offset + len, (objaddr_t)block_size); - auto left = headptr ? (offset - headptr->length()) : offset; - auto right = tailptr ? - (offset + len + tailptr->length()) : + auto left = headbl ? (offset - headbl->length()) : offset; + auto right = tailbl ? + (offset + len + tailbl->length()) : (offset + len); assert( - (headptr && ((zero_left - left) == - p2roundup(headptr->length(), block_size))) ^ - (!headptr && (zero_left == left))); + (headbl && ((zero_left - left) == + p2roundup(headbl->length(), block_size))) ^ + (!headbl && (zero_left == left))); assert( - (tailptr && ((right - zero_right) == - p2roundup(tailptr->length(), block_size))) ^ - (!tailptr && (right == zero_right))); + (tailbl && ((right - zero_right) == + p2roundup(tailbl->length(), block_size))) ^ + (!tailbl && (right == zero_right))); assert(right > left); // zero region too small for a reserved section, - // headptr and tailptr in same extent + // headbl and tailbl in same extent if (zero_right <= zero_left) { bufferlist bl; - if (headptr) { - bl.append(*headptr); + if (headbl) { + bl.append(*headbl); } bl.append_zero( - right - left - bl.length() - (tailptr ? 
tailbl->length() : 0)); + if (tailbl) { + bl.append(*tailbl); } assert(bl.length() % block_size == 0); assert(bl.length() == (right - left)); @@ -1230,16 +1240,16 @@ extent_to_write_list_t get_to_writes_with_zero_buffer( (data_base + left).checked_to_laddr(), bl)); return ret; } else { - // reserved section between ends, headptr and tailptr in different extents + // reserved section between ends, headbl and tailbl in different extents extent_to_write_list_t ret; - if (headptr) { - bufferlist headbl; - headbl.append(*headptr); - headbl.append_zero(zero_left - left - headbl.length()); - assert(headbl.length() % block_size == 0); - assert(headbl.length() > 0); + if (headbl) { + bufferlist head_zero_bl; + head_zero_bl.append(*headbl); + head_zero_bl.append_zero(zero_left - left - head_zero_bl.length()); + assert(head_zero_bl.length() % block_size == 0); + assert(head_zero_bl.length() > 0); ret.push_back(extent_to_write_t::create_data( - (data_base + left).checked_to_laddr(), headbl)); + (data_base + left).checked_to_laddr(), head_zero_bl)); } // reserved zero region ret.push_back(extent_to_write_t::create_zero( @@ -1247,14 +1257,14 @@ extent_to_write_list_t get_to_writes_with_zero_buffer( zero_right - zero_left)); assert(ret.back().len % block_size == 0); assert(ret.back().len > 0); - if (tailptr) { - bufferlist tailbl; - tailbl.append(*tailptr); - tailbl.append_zero(right - zero_right - tailbl.length()); - assert(tailbl.length() % block_size == 0); - assert(tailbl.length() > 0); + if (tailbl) { + bufferlist tail_zero_bl; + tail_zero_bl.append(*tailbl); + tail_zero_bl.append_zero(right - zero_right - tail_zero_bl.length()); + assert(tail_zero_bl.length() % block_size == 0); + assert(tail_zero_bl.length() > 0); ret.push_back(extent_to_write_t::create_data( - (data_base + zero_right).checked_to_laddr(), tailbl)); + (data_base + zero_right).checked_to_laddr(), tail_zero_bl)); } return ret; } @@ -1293,7 +1303,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite( (auto &pins, auto &to_write) mutable { LOG_PREFIX(ObjectDataHandler::overwrite); - DEBUGT("overwrite: {}~{}", + DEBUGT("overwrite: 0x{:x}~0x{:x}", ctx.t, offset, len); @@ -1306,13 +1316,13 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite( overwrite_plan ).si_then([ctx, data_base, len, offset, overwrite_plan, bl=std::move(bl), &to_write, &pins, this](auto p) mutable { - auto &[left_extent, headptr] = p; + auto &[left_extent, headbl] = p; if (left_extent) { ceph_assert(left_extent->addr == overwrite_plan.pin_begin); append_extent_to_write(to_write, std::move(*left_extent)); } - if (headptr) { - assert(headptr->length() > 0); + if (headbl) { + assert(headbl->length() > 0); } return operate_right( ctx, @@ -1321,19 +1331,19 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite( ).si_then([ctx, data_base, len, offset, pin_begin=overwrite_plan.pin_begin, pin_end=overwrite_plan.pin_end, - bl=std::move(bl), headptr=std::move(headptr), + bl=std::move(bl), headbl=std::move(headbl), &to_write, &pins, this](auto p) mutable { - auto &[right_extent, tailptr] = p; + auto &[right_extent, tailbl] = p; if (bl.has_value()) { auto write_offset = offset; bufferlist write_bl; - if (headptr) { - write_bl.append(*headptr); - write_offset = write_offset - headptr->length(); + if (headbl) { + write_bl.append(*headbl); + write_offset = write_offset - headbl->length(); } write_bl.claim_append(*bl); - if (tailptr) { - write_bl.append(*tailptr); + if (tailbl) { + write_bl.append(*tailbl); assert_aligned(write_bl.length()); } 
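+ // Editorial note (hypothetical numbers, not part of the patch): with a + // 0x1000 block size, writing 0x800 bytes at offset 0x800 reaches this point + // as headbl(0x800) plus bl(0x800), so write_offset has moved back to 0x0 and + // write_bl.length() == 0x1000, keeping the spliced extent block-aligned.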
splice_extent_to_write( @@ -1347,8 +1357,8 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite( ctx.tm.get_block_size(), offset, len, - std::move(headptr), - std::move(tailptr))); + std::move(headbl), + std::move(tailbl))); } if (right_extent) { ceph_assert(right_extent->get_end_addr() == pin_end); @@ -1383,7 +1393,7 @@ ObjectDataHandler::zero_ret ObjectDataHandler::zero( ctx, [this, ctx, offset, len](auto &object_data) { LOG_PREFIX(ObjectDataHandler::zero); - DEBUGT("zero to {}~{}, object_data: {}~{}, is_null {}", + DEBUGT("zero to 0x{:x}~0x{:x}, object_data: {}~0x{:x}, is_null {}", ctx.t, offset, len, @@ -1424,7 +1434,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::write( ctx, [this, ctx, offset, &bl](auto &object_data) { LOG_PREFIX(ObjectDataHandler::write); - DEBUGT("writing to {}~{}, object_data: {}~{}, is_null {}", + DEBUGT("writing to 0x{:x}~0x{:x}, object_data: {}~0x{:x}, is_null {}", ctx.t, offset, bl.length(), @@ -1469,7 +1479,7 @@ ObjectDataHandler::read_ret ObjectDataHandler::read( ctx, [ctx, obj_offset, len, &ret](const auto &object_data) { LOG_PREFIX(ObjectDataHandler::read); - DEBUGT("reading {}~{}", + DEBUGT("reading {}~0x{:x}", ctx.t, object_data.get_reserved_data_base(), object_data.get_reserved_data_len()); @@ -1501,83 +1511,74 @@ ObjectDataHandler::read_ret ObjectDataHandler::read( pins, [FNAME, ctx, l_start, l_end, &l_current, &ret](auto &pin) -> read_iertr::future<> { - auto pin_key = pin->get_key(); - if (l_current == l_start) { - ceph_assert(l_current >= pin_key); - } else { + auto pin_start = pin->get_key(); + extent_len_t read_start; + extent_len_t read_start_aligned; + if (l_current == l_start) { // first pin may skip head + ceph_assert(l_current.get_aligned_laddr() >= pin_start); + read_start = l_current.template + get_byte_distance<extent_len_t>(pin_start); + read_start_aligned = p2align(read_start, ctx.tm.get_block_size()); + } else { // non-first pin must match start assert(l_current > l_start); - ceph_assert(l_current == pin_key); + ceph_assert(l_current == pin_start); + read_start = 0; + read_start_aligned = 0; } + ceph_assert(l_current < l_end); auto pin_len = pin->get_length(); assert(pin_len > 0); - laddr_offset_t l_pin_end = pin_key + pin_len; - ceph_assert(l_current < l_pin_end); - laddr_offset_t l_current_end = std::min(l_pin_end, l_end); + laddr_offset_t pin_end = pin_start + pin_len; + assert(l_current < pin_end); + laddr_offset_t l_current_end = std::min(pin_end, l_end); + extent_len_t read_len = + l_current_end.get_byte_distance<extent_len_t>(l_current); + if (pin->get_val().is_zero()) { - DEBUGT("got {}~{} from zero-pin {}~{}", + DEBUGT("got {}~0x{:x} from zero-pin {}~0x{:x}", ctx.t, l_current, - l_current_end.get_byte_distance<loffset_t>(l_current), - pin_key, + read_len, + pin_start, pin_len); - ret.append_zero( - l_current_end.get_byte_distance< - extent_len_t>(l_current)); + ret.append_zero(read_len); l_current = l_current_end; return seastar::now(); } // non-zero pin - bool is_indirect = pin->is_indirect(); - laddr_t e_key; - extent_len_t e_len; - extent_len_t e_off; - if (is_indirect) { - e_key = pin->get_intermediate_base(); - e_len = pin->get_intermediate_length(); - e_off = pin->get_intermediate_offset(); - DEBUGT("reading {}~{} from indirect-pin {}~{}, direct-pin {}~{}(off={})", - ctx.t, - l_current, - l_current_end.get_byte_distance<extent_len_t>(l_current), - pin_key, - pin_len, - e_key, - e_len, - e_off); - assert(e_key <= pin->get_intermediate_key()); - assert(e_off + pin_len <= e_len); - } else { - DEBUGT("reading 
{}~{} from pin {}~{}", - ctx.t, - l_current, - l_current_end.get_byte_distance< - extent_len_t>(l_current), - pin_key, - pin_len); - e_key = pin_key; - e_len = pin_len; - e_off = 0; - } - extent_len_t e_current_off = (l_current + e_off) - .template get_byte_distance<extent_len_t>(pin_key); + laddr_t l_current_end_aligned = l_current_end.get_roundup_laddr(); + extent_len_t read_len_aligned = + l_current_end_aligned.get_byte_distance<extent_len_t>(pin_start); + read_len_aligned -= read_start_aligned; + extent_len_t unalign_start_offset = read_start - read_start_aligned; + DEBUGT("reading {}~0x{:x} from pin {}~0x{:x}", + ctx.t, + l_current, + read_len, + pin_start, + pin_len); return ctx.tm.read_pin<ObjectDataBlock>( ctx.t, - std::move(pin) + std::move(pin), + read_start_aligned, + read_len_aligned ).si_then([&ret, &l_current, l_current_end, -#ifndef NDEBUG - e_key, e_len, e_current_off](auto extent) { -#else - e_current_off](auto extent) { -#endif - assert(e_key == extent->get_laddr()); - assert(e_len == extent->get_length()); - ret.append( - bufferptr( - extent->get_bptr(), - e_current_off, - l_current_end.get_byte_distance<extent_len_t>(l_current))); + read_start_aligned, read_len_aligned, + unalign_start_offset, read_len](auto maybe_indirect_extent) { + auto aligned_bl = maybe_indirect_extent.get_range( + read_start_aligned, read_len_aligned); + if (read_len < read_len_aligned) { + ceph::bufferlist unaligned_bl; + unaligned_bl.substr_of( + aligned_bl, unalign_start_offset, read_len); + ret.append(std::move(unaligned_bl)); + } else { + assert(read_len == read_len_aligned); + assert(unalign_start_offset == 0); + ret.append(std::move(aligned_bl)); + } l_current = l_current_end; return seastar::now(); }).handle_error_interruptible( @@ -1608,7 +1609,7 @@ ObjectDataHandler::fiemap_ret ObjectDataHandler::fiemap( [ctx, obj_offset, len, &ret](const auto &object_data) { LOG_PREFIX(ObjectDataHandler::fiemap); DEBUGT( - "{}~{}, reservation {}~{}", + "0x{:x}~0x{:x}, reservation {}~0x{:x}", ctx.t, obj_offset, len, @@ -1663,7 +1664,7 @@ ObjectDataHandler::truncate_ret ObjectDataHandler::truncate( ctx, [this, ctx, offset](auto &object_data) { LOG_PREFIX(ObjectDataHandler::truncate); - DEBUGT("truncating {}~{} offset: {}", + DEBUGT("truncating {}~0x{:x} offset: 0x{:x}", ctx.t, object_data.get_reserved_data_base(), object_data.get_reserved_data_len(), @@ -1706,7 +1707,7 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone_extents( laddr_t data_base) { LOG_PREFIX(ObjectDataHandler::clone_extents); - TRACET(" object_data: {}~{}, data_base: {}", + TRACET("object_data: {}~0x{:x}, data_base: 0x{:x}", ctx.t, object_data.get_reserved_data_base(), object_data.get_reserved_data_len(), @@ -1791,7 +1792,7 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone( auto len = object_data.get_reserved_data_len(); object_data.clear(); LOG_PREFIX(ObjectDataHandler::clone); - DEBUGT("cloned obj reserve_data_base: {}, len {}", + DEBUGT("cloned obj reserve_data_base: {}, len 0x{:x}", ctx.t, d_object_data.get_reserved_data_base(), d_object_data.get_reserved_data_len()); @@ -1801,7 +1802,7 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone( d_object_data.get_reserved_data_len() ).si_then([&d_object_data, ctx, &object_data, base, len, this] { LOG_PREFIX("ObjectDataHandler::clone"); - DEBUGT("head obj reserve_data_base: {}, len {}", + DEBUGT("head obj reserve_data_base: {}, len 0x{:x}", ctx.t, object_data.get_reserved_data_base(), object_data.get_reserved_data_len()); diff --git 
a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h index 795daeddb11..7c2392731c0 100644 --- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h @@ -48,7 +48,8 @@ struct OMapNode : LogicalCachedExtent { need_merge(n_merge) {} }; - OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {} + explicit OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {} + explicit OMapNode(extent_len_t length) : LogicalCachedExtent(length) {} OMapNode(const OMapNode &other) : LogicalCachedExtent(other) {} diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc index 8d06accef1e..df97f394a0d 100644 --- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc @@ -734,23 +734,28 @@ omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth) { ceph_assert(depth > 0); if (depth > 1) { - return oc.tm.read_extent<OMapInnerNode>(oc.t, laddr, - OMAP_INNER_BLOCK_SIZE) - .handle_error_interruptible( + return oc.tm.read_extent<OMapInnerNode>( + oc.t, laddr, OMAP_INNER_BLOCK_SIZE + ).handle_error_interruptible( omap_load_extent_iertr::pass_further{}, crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" } - ).si_then( - [](auto&& e) { - return seastar::make_ready_future<OMapNodeRef>(std::move(e)); + ).si_then([](auto maybe_indirect_extent) { + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); + return seastar::make_ready_future<OMapNodeRef>( + std::move(maybe_indirect_extent.extent)); }); } else { - return oc.tm.read_extent<OMapLeafNode>(oc.t, laddr, OMAP_LEAF_BLOCK_SIZE + return oc.tm.read_extent<OMapLeafNode>( + oc.t, laddr, OMAP_LEAF_BLOCK_SIZE ).handle_error_interruptible( omap_load_extent_iertr::pass_further{}, crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" } - ).si_then( - [](auto&& e) { - return seastar::make_ready_future<OMapNodeRef>(std::move(e)); + ).si_then([](auto maybe_indirect_extent) { + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); + return seastar::make_ready_future<OMapNodeRef>( + std::move(maybe_indirect_extent.extent)); }); } } diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h index a2b51bbb0e1..2267942f035 100644 --- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h @@ -31,10 +31,18 @@ struct OMapInnerNode StringKVInnerNodeLayout { using OMapInnerNodeRef = TCachedExtentRef<OMapInnerNode>; using internal_iterator_t = const_iterator; - template <typename... T> - OMapInnerNode(T&&... 
t) : - OMapNode(std::forward<T>(t)...), - StringKVInnerNodeLayout(get_bptr().c_str()) {} + + explicit OMapInnerNode(ceph::bufferptr &&ptr) + : OMapNode(std::move(ptr)) { + this->set_layout_buf(this->get_bptr().c_str()); + } + // Must be identical with OMapInnerNode(ptr) after on_fully_loaded() + explicit OMapInnerNode(extent_len_t length) + : OMapNode(length) {} + OMapInnerNode(const OMapInnerNode &rhs) + : OMapNode(rhs) { + this->set_layout_buf(this->get_bptr().c_str()); + } omap_node_meta_t get_node_meta() const final { return get_meta(); } bool extent_will_overflow(size_t ksize, std::optional<size_t> vsize) const { @@ -46,6 +54,10 @@ struct OMapInnerNode bool extent_is_below_min() const { return below_min(); } uint32_t get_node_size() { return get_size(); } + void on_fully_loaded() final { + this->set_layout_buf(this->get_bptr().c_str()); + } + CachedExtentRef duplicate_for_write(Transaction&) final { assert(delta_buffer.empty()); return CachedExtentRef(new OMapInnerNode(*this)); @@ -148,10 +160,18 @@ struct OMapLeafNode using OMapLeafNodeRef = TCachedExtentRef<OMapLeafNode>; using internal_iterator_t = const_iterator; - template <typename... T> - OMapLeafNode(T&&... t) : - OMapNode(std::forward<T>(t)...), - StringKVLeafNodeLayout(get_bptr().c_str()) {} + + explicit OMapLeafNode(ceph::bufferptr &&ptr) + : OMapNode(std::move(ptr)) { + this->set_layout_buf(this->get_bptr().c_str()); + } + // Must be identical with OMapLeafNode(ptr) after on_fully_loaded() + explicit OMapLeafNode(extent_len_t length) + : OMapNode(length) {} + OMapLeafNode(const OMapLeafNode &rhs) + : OMapNode(rhs) { + this->set_layout_buf(this->get_bptr().c_str()); + } omap_node_meta_t get_node_meta() const final { return get_meta(); } bool extent_will_overflow( @@ -164,6 +184,10 @@ struct OMapLeafNode bool extent_is_below_min() const { return below_min(); } uint32_t get_node_size() { return get_size(); } + void on_fully_loaded() final { + this->set_layout_buf(this->get_bptr().c_str()); + } + CachedExtentRef duplicate_for_write(Transaction&) final { assert(delta_buffer.empty()); return CachedExtentRef(new OMapLeafNode(*this)); diff --git a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h index 72b13fedfb1..3825ebef145 100644 --- a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h +++ b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h @@ -504,8 +504,13 @@ public: inner_remove(iter); } - StringKVInnerNodeLayout(char *buf) : - buf(buf) {} + StringKVInnerNodeLayout() : buf(nullptr) {} + + void set_layout_buf(char *_buf) { + assert(buf == nullptr); + assert(_buf != nullptr); + buf = _buf; + } uint32_t get_size() const { ceph_le32 &size = *layout.template Pointer<0>(buf); @@ -1120,8 +1125,13 @@ public: leaf_remove(iter); } - StringKVLeafNodeLayout(char *buf) : - buf(buf) {} + StringKVLeafNodeLayout() : buf(nullptr) {} + + void set_layout_buf(char *_buf) { + assert(buf == nullptr); + assert(_buf != nullptr); + buf = _buf; + } const_iterator iter_begin() const { return const_iterator( diff --git a/src/crimson/os/seastore/onode.cc b/src/crimson/os/seastore/onode.cc index f3fd6eb18a5..dc8f6e87c8e 100644 --- a/src/crimson/os/seastore/onode.cc +++ b/src/crimson/os/seastore/onode.cc @@ -11,7 +11,7 @@ std::ostream& operator<<(std::ostream &out, const Onode &rhs) auto &layout = rhs.get_layout(); return out << "Onode(" << "hobj=" << rhs.hobj << ", " - << "size=" << static_cast<uint32_t>(layout.size) + << "size=0x" << 
std::hex << static_cast<uint32_t>(layout.size) << std::dec << ")"; } diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h index 072c57864be..fa2ed65c0f3 100644 --- a/src/crimson/os/seastore/onode.h +++ b/src/crimson/os/seastore/onode.h @@ -36,8 +36,8 @@ struct onode_layout_t { object_data_le_t object_data; - char oi[MAX_OI_LENGTH]; - char ss[MAX_SS_LENGTH]; + char oi[MAX_OI_LENGTH] = {0}; + char ss[MAX_SS_LENGTH] = {0}; } __attribute__((packed)); class Transaction; diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h index 9230051cc50..04b959f767d 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h @@ -41,8 +41,10 @@ class SeastoreSuper final: public Super { class SeastoreNodeExtent final: public NodeExtent { public: - SeastoreNodeExtent(ceph::bufferptr &&ptr) + explicit SeastoreNodeExtent(ceph::bufferptr &&ptr) : NodeExtent(std::move(ptr)) {} + explicit SeastoreNodeExtent(extent_len_t length) + : NodeExtent(length) {} SeastoreNodeExtent(const SeastoreNodeExtent& other) : NodeExtent(other) {} ~SeastoreNodeExtent() override = default; @@ -111,10 +113,14 @@ class SeastoreNodeExtentManager final: public TransactionManagerHandle { } } return tm.read_extent<SeastoreNodeExtent>(t, addr - ).si_then([addr, &t](auto&& e) -> read_iertr::future<NodeExtentRef> { + ).si_then([addr, &t](auto maybe_indirect_extent) + -> read_iertr::future<NodeExtentRef> { + auto e = maybe_indirect_extent.extent; SUBTRACET(seastore_onode, "read {}B at {} -- {}", t, e->get_length(), e->get_laddr(), *e); + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); assert(e->get_laddr() == addr); std::ignore = addr; return read_iertr::make_ready_future<NodeExtentRef>(e); diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc index 9f6a566d15c..97b7902edf5 100644 --- a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc +++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc @@ -188,10 +188,10 @@ BlockRBManager::write_ertr::future<> BlockRBManager::write( void BlockRBManager::prefill_fragmented_device() { LOG_PREFIX(BlockRBManager::prefill_fragmented_device); - // the first 2 blocks must be allocated to lba root + // the first 3 blocks must be allocated to lba root // and backref root during mkfs - for (size_t block = get_block_size() * 2; - block <= get_size() - get_block_size() * 2; + for (size_t block = get_block_size() * 3; + block <= get_size() - get_block_size() * 3; block += get_block_size() * 2) { DEBUG("marking {}~{} used", get_start_rbm_addr() + block, diff --git a/src/crimson/os/seastore/record_scanner.cc b/src/crimson/os/seastore/record_scanner.cc index 5fab11505ce..172ba77577e 100644 --- a/src/crimson/os/seastore/record_scanner.cc +++ b/src/crimson/os/seastore/record_scanner.cc @@ -18,7 +18,7 @@ RecordScanner::scan_valid_records( { LOG_PREFIX(RecordScanner::scan_valid_records); initialize_cursor(cursor); - DEBUG("starting at {}, budget={}", cursor, budget); + DEBUG("starting at {}, budget=0x{:x}", cursor, budget); auto retref = std::make_unique<size_t>(0); auto &budget_used = *retref; return crimson::repeat( @@ -91,7 +91,7 @@ RecordScanner::scan_valid_records( } }().safe_then([=, 
&budget_used, &cursor] { if (cursor.is_complete() || budget_used >= budget) { - DEBUG("finish at {}, budget_used={}, budget={}", + DEBUG("finish at {}, budget_used=0x{:x}, budget=0x{:x}", cursor, budget_used, budget); return seastar::stop_iteration::yes; } else { @@ -112,13 +112,13 @@ RecordScanner::read_validate_record_metadata( paddr_t start = cursor.seq.offset; auto block_size = cursor.get_block_size(); if (get_segment_off(cursor.seq.offset) + block_size > get_segment_end_offset(cursor.seq.offset)) { - DEBUG("failed -- record group header block {}~4096 > segment_size {}", - start, get_segment_end_offset(cursor.seq.offset)); + DEBUG("failed -- record group header block {}~0x{:x} > segment_size 0x{:x}", + start, block_size, get_segment_end_offset(cursor.seq.offset)); return read_validate_record_metadata_ret( read_validate_record_metadata_ertr::ready_future_marker{}, std::nullopt); } - TRACE("reading record group header block {}~4096", start); + TRACE("reading record group header block {}~0x{:x}", start, block_size); return read(start, block_size ).safe_then([this, FNAME, nonce, block_size, &cursor](bufferptr bptr) -> read_validate_record_metadata_ret { @@ -159,7 +159,7 @@ RecordScanner::read_validate_record_metadata( paddr_t rest_start = cursor.seq.offset.add_offset(block_size); auto rest_len = header.mdlength - block_size; - TRACE("reading record group header rest {}~{}", rest_start, rest_len); + TRACE("reading record group header rest {}~0x{:x}", rest_start, rest_len); return read(rest_start, rest_len ).safe_then([header=std::move(header), bl=std::move(bl) ](auto&& bptail) mutable { @@ -189,7 +189,7 @@ RecordScanner::read_validate_data_ret RecordScanner::read_validate_data( { LOG_PREFIX(RecordScanner::read_validate_data); auto data_addr = record_base.add_offset(header.mdlength); - TRACE("reading record group data blocks {}~{}", data_addr, header.dlength); + TRACE("reading record group data blocks {}~0x{:x}", data_addr, header.dlength); return read( data_addr, header.dlength @@ -220,7 +220,7 @@ RecordScanner::consume_next_records( total_length } }; - DEBUG("processing {} at {}, budget_used={}", + DEBUG("processing {} at {}, budget_used=0x{:x}", next.header, locator, budget_used); return handler( locator, diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h index 942434dd596..26b8604500d 100644 --- a/src/crimson/os/seastore/root_block.h +++ b/src/crimson/os/seastore/root_block.h @@ -41,7 +41,7 @@ struct RootBlock : CachedExtent { CachedExtent* lba_root_node = nullptr; CachedExtent* backref_root_node = nullptr; - RootBlock() : CachedExtent(zero_length_t()) {}; + RootBlock() : CachedExtent(root_construct_t()) {}; RootBlock(const RootBlock &rhs) : CachedExtent(rhs), diff --git a/src/crimson/os/seastore/root_meta.h b/src/crimson/os/seastore/root_meta.h new file mode 100644 index 00000000000..edf082f1e38 --- /dev/null +++ b/src/crimson/os/seastore/root_meta.h @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "crimson/os/seastore/cached_extent.h" + +namespace crimson::os::seastore { + +struct RootMetaBlock : LogicalCachedExtent { + using meta_t = std::map<std::string, std::string>; + using Ref = TCachedExtentRef<RootMetaBlock>; + static constexpr size_t SIZE = 4096; + static constexpr int MAX_META_LENGTH = 1024; + + explicit RootMetaBlock(ceph::bufferptr &&ptr) + : LogicalCachedExtent(std::move(ptr)) {} + explicit RootMetaBlock(extent_len_t length) + : 
LogicalCachedExtent(length) {} + RootMetaBlock(const RootMetaBlock &rhs) + : LogicalCachedExtent(rhs) {} + + CachedExtentRef duplicate_for_write(Transaction&) final { + return CachedExtentRef(new RootMetaBlock(*this)); + } + + static constexpr extent_types_t TYPE = extent_types_t::ROOT_META; + extent_types_t get_type() const final { + return extent_types_t::ROOT_META; + } + + /// dumps root meta as delta + ceph::bufferlist get_delta() final { + ceph::bufferlist bl; + ceph::buffer::ptr bptr(get_bptr(), 0, MAX_META_LENGTH); + bl.append(bptr); + return bl; + } + + /// overwrites root + void apply_delta(const ceph::bufferlist &_bl) final + { + assert(_bl.length() == MAX_META_LENGTH); + ceph::bufferlist bl = _bl; + bl.rebuild(); + get_bptr().copy_in(0, MAX_META_LENGTH, bl.front().c_str()); + } + + meta_t get_meta() const { + bufferlist bl; + bl.append(get_bptr()); + meta_t ret; + auto iter = bl.cbegin(); + decode(ret, iter); + return ret; + } + + void set_meta(const meta_t &m) { + ceph::bufferlist bl; + encode(m, bl); + ceph_assert(bl.length() <= MAX_META_LENGTH); + bl.rebuild(); + get_bptr().zero(0, MAX_META_LENGTH); + get_bptr().copy_in(0, bl.length(), bl.front().c_str()); + } + +}; +using RootMetaBlockRef = RootMetaBlock::Ref; + +} // crimson::os::seastore + + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::os::seastore::RootMetaBlock> + : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc index d90edbb20db..6a866cb1f9b 100644 --- a/src/crimson/os/seastore/seastore.cc +++ b/src/crimson/os/seastore/seastore.cc @@ -408,6 +408,7 @@ SeaStore::Shard::mkfs_managers() return transaction_manager->with_transaction_intr( Transaction::src_t::MUTATE, "mkfs_seastore", + CACHE_HINT_TOUCH, [this](auto& t) { LOG_PREFIX(SeaStoreS::mkfs_managers); @@ -897,9 +898,10 @@ get_ranges(CollectionRef ch, seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> SeaStore::Shard::list_objects(CollectionRef ch, - const ghobject_t& start, - const ghobject_t& end, - uint64_t limit) const + const ghobject_t& start, + const ghobject_t& end, + uint64_t limit, + uint32_t op_flags) const { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -910,13 +912,14 @@ SeaStore::Shard::list_objects(CollectionRef ch, return seastar::do_with( RetType(std::vector<ghobject_t>(), start), std::move(limit), - [this, ch, start, end](auto& ret, auto& limit) { - return repeat_eagain([this, ch, start, end, &limit, &ret] { + [this, ch, start, end, op_flags](auto& ret, auto& limit) { + return repeat_eagain([this, ch, start, end, &limit, &ret, op_flags] { ++(shard_stats.repeat_read_num); return transaction_manager->with_transaction_intr( Transaction::src_t::READ, "list_objects", + op_flags, [this, ch, start, end, &limit, &ret](auto &t) { LOG_PREFIX(SeaStoreS::list_objects); @@ -1054,6 +1057,7 @@ SeaStore::Shard::list_collections() return transaction_manager->with_transaction_intr( Transaction::src_t::READ, "list_collections", + CACHE_HINT_TOUCH, [this, &ret](auto& t) { LOG_PREFIX(SeaStoreS::list_collections); @@ -1137,6 +1141,7 @@ SeaStore::Shard::read( Transaction::src_t::READ, "read", op_type_t::READ, + op_flags, [this, offset, len, op_flags](auto &t, auto &onode) { return _read(t, onode, offset, len, op_flags); }).finally([this] { @@ -1148,7 +1153,8 @@ SeaStore::Shard::read( SeaStore::Shard::base_errorator::future<bool> SeaStore::Shard::exists( CollectionRef c, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { 
LOG_PREFIX(SeaStoreS::exists); ++(shard_stats.read_num); @@ -1160,6 +1166,7 @@ SeaStore::Shard::exists( Transaction::src_t::READ, "exists", op_type_t::READ, + op_flags, [FNAME](auto& t, auto&) { DEBUGT("exists", t); return seastar::make_ready_future<bool>(true); @@ -1240,7 +1247,8 @@ SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist> SeaStore::Shard::get_attr( CollectionRef ch, const ghobject_t& oid, - std::string_view name) const + std::string_view name, + uint32_t op_flags) const { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1251,6 +1259,7 @@ SeaStore::Shard::get_attr( Transaction::src_t::READ, "get_attr", op_type_t::GET_ATTR, + op_flags, [this, name](auto &t, auto& onode) { return _get_attr(t, onode, name); }).handle_error( @@ -1296,7 +1305,8 @@ SeaStore::Shard::_get_attrs( SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t> SeaStore::Shard::get_attrs( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1307,6 +1317,7 @@ SeaStore::Shard::get_attrs( Transaction::src_t::READ, "get_attrs", op_type_t::GET_ATTRS, + op_flags, [this](auto &t, auto& onode) { return _get_attrs(t, onode); }).handle_error( @@ -1331,14 +1342,15 @@ seastar::future<struct stat> SeaStore::Shard::_stat( st.st_blksize = device->get_block_size(); st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize; st.st_nlink = 1; - DEBUGT("oid={}, size={}, blksize={}", + DEBUGT("oid={}, size=0x{:x}, blksize=0x{:x}", t, oid, st.st_size, st.st_blksize); return seastar::make_ready_future<struct stat>(st); } seastar::future<struct stat> SeaStore::Shard::stat( CollectionRef c, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1349,6 +1361,7 @@ seastar::future<struct stat> SeaStore::Shard::stat( Transaction::src_t::READ, "stat", op_type_t::STAT, + op_flags, [this, oid](auto &t, auto &onode) { return _stat(t, onode, oid); }).handle_error( @@ -1364,9 +1377,10 @@ seastar::future<struct stat> SeaStore::Shard::stat( SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist> SeaStore::Shard::omap_get_header( CollectionRef ch, - const ghobject_t& oid) + const ghobject_t& oid, + uint32_t op_flags) { - return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY); + return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY, op_flags); } SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t> @@ -1389,7 +1403,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t> SeaStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const omap_keys_t &keys) + const omap_keys_t &keys, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1400,6 +1415,7 @@ SeaStore::Shard::omap_get_values( Transaction::src_t::READ, "omap_get_values", op_type_t::OMAP_GET_VALUES, + op_flags, [this, keys](auto &t, auto &onode) { return do_omap_get_values(t, onode, keys); }).finally([this] { @@ -1529,7 +1545,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_paged_t> SeaStore::Shard::omap_get_values( CollectionRef ch, const ghobject_t &oid, - const std::optional<std::string> &start) + const std::optional<std::string> &start, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1540,6 +1557,7 @@ SeaStore::Shard::omap_get_values( Transaction::src_t::READ, "omap_get_values2", op_type_t::OMAP_GET_VALUES2, + op_flags, [this, start](auto 
&t, auto &onode) { return do_omap_get_values(t, onode, start); }).finally([this] { @@ -1589,7 +1607,8 @@ SeaStore::Shard::fiemap( CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { ++(shard_stats.read_num); ++(shard_stats.pending_read_num); @@ -1600,6 +1619,7 @@ SeaStore::Shard::fiemap( Transaction::src_t::READ, "fiemap", op_type_t::READ, + op_flags, [this, off, len](auto &t, auto &onode) { return _fiemap(t, onode, off, len); }).finally([this] { @@ -1640,7 +1660,7 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks( [this, num_bytes](auto &ctx) { LOG_PREFIX(SeaStoreS::do_transaction_no_callbacks); return with_trans_intr(*ctx.transaction, [&ctx, this, FNAME, num_bytes](auto &t) { - DEBUGT("cid={}, {} operations, {} bytes, {} colls, {} objects ...", + DEBUGT("cid={}, {} operations, 0x{:x} bytes, {} colls, {} objects ...", t, ctx.ch->get_cid(), ctx.ext_transaction.get_num_ops(), num_bytes, @@ -2677,6 +2697,7 @@ seastar::future<> SeaStore::Shard::write_meta( return transaction_manager->with_transaction_intr( Transaction::src_t::MUTATE, "write_meta", + CACHE_HINT_NOCACHE, [this, &key, &value](auto& t) { LOG_PREFIX(SeaStoreS::write_meta); @@ -2721,6 +2742,13 @@ SeaStore::read_meta(const std::string& key) ); } +seastar::future<std::string> SeaStore::get_default_device_class() +{ + using crimson::common::get_conf; + std::string type = get_conf<std::string>("seastore_main_device_type"); + return seastar::make_ready_future<std::string>(type); +} + uuid_d SeaStore::Shard::get_fsid() const { return device->get_meta().seastore_id; diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h index 185072744f2..e2a993b9e20 100644 --- a/src/crimson/os/seastore/seastore.h +++ b/src/crimson/os/seastore/seastore.h @@ -101,7 +101,8 @@ public: seastar::future<struct stat> stat( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<ceph::bufferlist> read( CollectionRef c, @@ -118,32 +119,38 @@ public: base_errorator::future<bool> exists( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; get_attr_errorator::future<ceph::bufferlist> get_attr( CollectionRef c, const ghobject_t& oid, - std::string_view name) const final; + std::string_view name, + uint32_t op_flags = 0) const final; get_attrs_ertr::future<attrs_t> get_attrs( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; read_errorator::future<omap_values_t> omap_get_values( CollectionRef c, const ghobject_t& oid, - const omap_keys_t& keys) final; + const omap_keys_t& keys, + uint32_t op_flags = 0) final; /// Retrieves paged set of values > start (if present) read_errorator::future<omap_values_paged_t> omap_get_values( CollectionRef c, ///< [in] collection const ghobject_t &oid, ///< [in] oid - const std::optional<std::string> &start ///< [in] start, empty for begin + const std::optional<std::string> &start, ///< [in] start, empty for begin + uint32_t op_flags = 0 ) final; ///< @return <done, values> values.empty() iff done get_attr_errorator::future<bufferlist> omap_get_header( CollectionRef c, - const ghobject_t& oid) final; + const ghobject_t& oid, + uint32_t op_flags = 0) final; /// std::get<1>(ret) returns end if and only if the listing has listed all /// the items within the range, otherwise it returns the next key to be listed. 
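The read-side API changes above thread a caller-supplied op_flags value through every SeaStore::Shard read path (stat, read, exists, get_attr, get_attrs, omap_get_values, omap_get_header, fiemap, list_objects), defaulting to 0 so existing callers are unaffected. Those fadvise-style flags ultimately collapse into the binary cache_hint_t introduced in seastore_types.h further down in this patch. Below is a minimal, self-contained C++ sketch of that collapse; the flag bit values are assumptions for illustration only, since the real CEPH_OSD_OP_FLAG_FADVISE_* constants come from include/rados.h.

#include <cstdint>

namespace sketch {

// Assumed bit values, for illustration only; the real constants
// are the CEPH_OSD_OP_FLAG_FADVISE_* flags in include/rados.h.
enum : uint32_t {
  FADVISE_DONTNEED = 1u << 6,
  FADVISE_NOCACHE  = 1u << 7,
};

enum class cache_hint { TOUCH, NOCACHE };

// Any "don't keep this cached" advice degrades the hint to NOCACHE;
// everything else keeps the default TOUCH (promote in the cache LRU).
inline cache_hint to_cache_hint(uint32_t op_flags) {
  if (op_flags & (FADVISE_DONTNEED | FADVISE_NOCACHE)) {
    return cache_hint::NOCACHE;
  }
  return cache_hint::TOUCH;
}

} // namespace sketch

Because op_flags defaults to 0 on each new signature, untouched callers keep the TOUCH behavior, which is why the parameter can be appended across the interface without changing existing read semantics.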
@@ -151,7 +158,8 @@ public: CollectionRef c, const ghobject_t& start, const ghobject_t& end, - uint64_t limit) const final; + uint64_t limit, + uint32_t op_flags = 0) const final; seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final; seastar::future<CollectionRef> open_collection(const coll_t& cid) final; @@ -170,7 +178,8 @@ public: CollectionRef ch, const ghobject_t& oid, uint64_t off, - uint64_t len) final; + uint64_t len, + uint32_t op_flags = 0) final; unsigned get_max_attr_name_length() const final { return 256; @@ -191,6 +200,8 @@ public: seastar::future<> write_meta(const std::string& key, const std::string& value); + seastar::future<std::string> get_default_device_class(); + store_statfs_t stat() const; uuid_d get_fsid() const; @@ -249,7 +260,8 @@ public: return seastar::do_with( internal_context_t( ch, std::move(t), - transaction_manager->create_transaction(src, tname)), + transaction_manager->create_transaction( + src, tname, t.get_fadvise_flags())), std::forward<F>(f), [this, op_type](auto &ctx, auto &f) { assert(shard_stats.starting_io_num); @@ -296,20 +308,22 @@ public: Transaction::src_t src, const char* tname, op_type_t op_type, + cache_hint_t cache_hint_flags, F &&f) const { auto begin_time = std::chrono::steady_clock::now(); return seastar::do_with( oid, Ret{}, std::forward<F>(f), - [this, ch, src, op_type, begin_time, tname + [this, ch, src, op_type, begin_time, tname, cache_hint_flags ](auto &oid, auto &ret, auto &f) { - return repeat_eagain([&, this, ch, src, tname] { + return repeat_eagain([&, this, ch, src, tname, cache_hint_flags] { assert(src == Transaction::src_t::READ); ++(shard_stats.repeat_read_num); return transaction_manager->with_transaction_intr( src, tname, + cache_hint_flags, [&, this, ch, tname](auto& t) { LOG_PREFIX(SeaStoreS::repeat_with_onode); @@ -567,6 +581,8 @@ public: seastar::future<std::vector<coll_core_t>> list_collections() final; + seastar::future<std::string> get_default_device_class() final; + FuturizedStore::Shard& get_sharded_store() final { return shard_stores.local(); } diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index f379dd0117c..a57f56d4ab4 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -246,6 +246,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t) return out << "LADDR_LEAF"; case extent_types_t::ONODE_BLOCK_STAGED: return out << "ONODE_BLOCK_STAGED"; + case extent_types_t::ROOT_META: + return out << "ROOT_META"; case extent_types_t::OMAP_INNER: return out << "OMAP_INNER"; case extent_types_t::OMAP_LEAF: @@ -349,11 +351,11 @@ std::ostream &operator<<(std::ostream &out, const delta_info_t &delta) << "type: " << delta.type << ", paddr: " << delta.paddr << ", laddr: " << delta.laddr - << ", prev_crc: " << delta.prev_crc - << ", final_crc: " << delta.final_crc - << ", length: " << delta.length + << ", prev_crc: 0x" << std::hex << delta.prev_crc + << ", final_crc: 0x" << delta.final_crc + << ", length: 0x" << delta.length << std::dec << ", pversion: " << delta.pversion - << ", ext_seq: " << delta.ext_seq + << ", ext_seq: " << segment_seq_printer_t{delta.ext_seq} << ", seg_type: " << delta.seg_type << ")"; } @@ -371,7 +373,7 @@ std::ostream &operator<<(std::ostream &out, const extent_info_t &info) return out << "extent_info_t(" << "type: " << info.type << ", addr: " << info.addr - << ", len: " << info.len + << ", len: 0x" << std::hex << info.len << std::dec << ")"; } @@ -385,7 +387,7 @@ 
std::ostream &operator<<(std::ostream &out, const segment_header_t &header) << " " << rewrite_gen_printer_t{header.generation} << ", dirty_tail=" << header.dirty_tail << ", alloc_tail=" << header.alloc_tail - << ", segment_nonce=" << header.segment_nonce + << ", segment_nonce=0x" << std::hex << header.segment_nonce << std::dec << ", modify_time=" << mod_time_point_printer_t{header.modify_time} << ")"; } @@ -396,7 +398,7 @@ std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail) << tail.physical_segment_id << " " << tail.type << " " << segment_seq_printer_t{tail.segment_seq} - << ", segment_nonce=" << tail.segment_nonce + << ", segment_nonce=0x" << std::hex << tail.segment_nonce << std::dec << ", modify_time=" << mod_time_point_printer_t{tail.modify_time} << ", num_extents=" << tail.num_extents << ")"; @@ -462,8 +464,8 @@ std::ostream &operator<<(std::ostream& out, const record_size_t& rsize) { return out << "record_size_t(" << "record_type=" << rsize.record_type - << "raw_md=" << rsize.get_raw_mdlength() - << ", data=" << rsize.dlength + << "raw_md=0x" << std::hex << rsize.get_raw_mdlength() + << ", data=0x" << rsize.dlength << std::dec << ")"; } @@ -507,11 +509,11 @@ std::ostream& operator<<(std::ostream& out, const record_group_header_t& h) { return out << "record_group_header_t(" << "num_records=" << h.records - << ", mdlength=" << h.mdlength - << ", dlength=" << h.dlength - << ", nonce=" << h.segment_nonce + << ", mdlength=0x" << std::hex << h.mdlength + << ", dlength=0x" << h.dlength + << ", segment_nonce=0x" << h.segment_nonce << std::dec << ", committed_to=" << h.committed_to - << ", data_crc=" << h.data_crc + << ", data_crc=0x" << std::hex << h.data_crc << std::dec << ")"; } @@ -554,9 +556,9 @@ std::ostream& operator<<(std::ostream& out, const record_group_size_t& size) { return out << "record_group_size_t(" << "record_type=" << size.record_type - << "raw_md=" << size.get_raw_mdlength() - << ", data=" << size.dlength - << ", block_size=" << size.block_size + << "raw_md=0x" << std::hex << size.get_raw_mdlength() + << ", data=0x" << size.dlength + << ", block_size=0x" << size.block_size << std::dec << ", fullness=" << size.get_fullness() << ")"; } @@ -911,7 +913,7 @@ std::ostream& operator<<(std::ostream& out, const write_result_t& w) { return out << "write_result_t(" << "start=" << w.start_seq - << ", length=" << w.length + << ", length=0x" << std::hex << w.length << std::dec << ")"; } diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index df5c184e7ab..5930469ca07 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -3,6 +3,7 @@ #pragma once +#include <deque> #include <limits> #include <numeric> #include <optional> @@ -14,13 +15,47 @@ #include "include/byteorder.h" #include "include/denc.h" +#include "include/encoding.h" #include "include/buffer.h" #include "include/intarith.h" #include "include/interval_set.h" #include "include/uuid.h" +#include "include/rados.h" namespace crimson::os::seastore { +class cache_hint_t { + enum hint_t { + TOUCH, + NOCACHE + }; +public: + static constexpr cache_hint_t get_touch() { + return hint_t::TOUCH; + } + static constexpr cache_hint_t get_nocache() { + return hint_t::NOCACHE; + } + cache_hint_t(uint32_t flags) { + if (unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) || + unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) { + hint = NOCACHE; + } + } + bool operator==(const cache_hint_t &other) const { + return hint == other.hint; 
+ } + bool operator!=(const cache_hint_t &other) const { + return hint != other.hint; + } +private: + constexpr cache_hint_t(hint_t hint) : hint(hint) {} + hint_t hint = hint_t::TOUCH; +}; + +inline constexpr cache_hint_t CACHE_HINT_TOUCH = cache_hint_t::get_touch(); +inline constexpr cache_hint_t CACHE_HINT_NOCACHE = cache_hint_t::get_nocache(); + /* using a special xattr key "omap_header" to store omap header */ const std::string OMAP_HEADER_XATTR_KEY = "omap_header"; @@ -1226,7 +1261,6 @@ constexpr laddr_t L_ADDR_MAX = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX); constexpr laddr_t L_ADDR_MIN = laddr_t::from_raw_uint(0); constexpr laddr_t L_ADDR_NULL = L_ADDR_MAX; constexpr laddr_t L_ADDR_ROOT = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX - 1); -constexpr laddr_t L_ADDR_LBAT = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX - 2); struct __attribute__((packed)) laddr_le_t { ceph_le64 laddr; @@ -1378,23 +1412,24 @@ enum class extent_types_t : uint8_t { LADDR_INTERNAL = 1, LADDR_LEAF = 2, DINK_LADDR_LEAF = 3, // should only be used for unitttests - OMAP_INNER = 4, - OMAP_LEAF = 5, - ONODE_BLOCK_STAGED = 6, - COLL_BLOCK = 7, - OBJECT_DATA_BLOCK = 8, - RETIRED_PLACEHOLDER = 9, + ROOT_META = 4, + OMAP_INNER = 5, + OMAP_LEAF = 6, + ONODE_BLOCK_STAGED = 7, + COLL_BLOCK = 8, + OBJECT_DATA_BLOCK = 9, + RETIRED_PLACEHOLDER = 10, // the following two types are not extent types, // they are just used to indicates paddr allocation deltas - ALLOC_INFO = 10, - JOURNAL_TAIL = 11, + ALLOC_INFO = 11, + JOURNAL_TAIL = 12, // Test Block Types - TEST_BLOCK = 12, - TEST_BLOCK_PHYSICAL = 13, - BACKREF_INTERNAL = 14, - BACKREF_LEAF = 15, + TEST_BLOCK = 13, + TEST_BLOCK_PHYSICAL = 14, + BACKREF_INTERNAL = 15, + BACKREF_LEAF = 16, // None and the number of valid extent_types_t - NONE = 16, + NONE = 17, }; using extent_types_le_t = uint8_t; constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE); @@ -1409,12 +1444,12 @@ constexpr bool is_data_type(extent_types_t type) { } constexpr bool is_logical_metadata_type(extent_types_t type) { - return type >= extent_types_t::OMAP_INNER && + return type >= extent_types_t::ROOT_META && type <= extent_types_t::COLL_BLOCK; } constexpr bool is_logical_type(extent_types_t type) { - if ((type >= extent_types_t::OMAP_INNER && + if ((type >= extent_types_t::ROOT_META && type <= extent_types_t::OBJECT_DATA_BLOCK) || type == extent_types_t::TEST_BLOCK) { assert(is_logical_metadata_type(type) || @@ -1466,6 +1501,23 @@ constexpr bool is_physical_type(extent_types_t type) { } } +constexpr bool is_backref_mapped_type(extent_types_t type) { + if ((type >= extent_types_t::LADDR_INTERNAL && + type <= extent_types_t::OBJECT_DATA_BLOCK) || + type == extent_types_t::TEST_BLOCK || + type == extent_types_t::TEST_BLOCK_PHYSICAL) { + assert(is_logical_type(type) || + is_lba_node(type) || + type == extent_types_t::TEST_BLOCK_PHYSICAL); + return true; + } else { + assert(!is_logical_type(type) && + !is_lba_node(type) && + type != extent_types_t::TEST_BLOCK_PHYSICAL); + return false; + } +} + constexpr bool is_real_type(extent_types_t type) { if (type <= extent_types_t::OBJECT_DATA_BLOCK || (type >= extent_types_t::TEST_BLOCK && @@ -1617,8 +1669,8 @@ struct delta_info_t { extent_types_t type = extent_types_t::NONE; ///< delta type paddr_t paddr; ///< physical address laddr_t laddr = L_ADDR_NULL; ///< logical address - uint32_t prev_crc = 0; - uint32_t final_crc = 0; + checksum_t prev_crc = 0; + checksum_t final_crc = 0; extent_len_t length = 0; ///< extent length 
extent_version_t pversion; ///< prior version segment_seq_t ext_seq; ///< seq of the extent's segment @@ -1926,54 +1978,29 @@ using backref_root_t = phy_tree_root_t; * TODO: generalize this to permit more than one lba_manager implementation */ struct __attribute__((packed)) root_t { - using meta_t = std::map<std::string, std::string>; - - static constexpr int MAX_META_LENGTH = 1024; - backref_root_t backref_root; lba_root_t lba_root; laddr_le_t onode_root; coll_root_le_t collection_root; + laddr_le_t meta; - char meta[MAX_META_LENGTH]; - - root_t() { - set_meta(meta_t{}); - } + root_t() = default; void adjust_addrs_from_base(paddr_t base) { lba_root.adjust_addrs_from_base(base); backref_root.adjust_addrs_from_base(base); } - - meta_t get_meta() { - bufferlist bl; - bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta)); - meta_t ret; - auto iter = bl.cbegin(); - decode(ret, iter); - return ret; - } - - void set_meta(const meta_t &m) { - ceph::bufferlist bl; - encode(m, bl); - ceph_assert(bl.length() < MAX_META_LENGTH); - bl.rebuild(); - auto &bptr = bl.front(); - ::memset(meta, 0, MAX_META_LENGTH); - ::memcpy(meta, bptr.c_str(), bl.length()); - } }; struct alloc_blk_t { alloc_blk_t( - paddr_t paddr, - laddr_t laddr, + const paddr_t& paddr, + const laddr_t& laddr, extent_len_t len, extent_types_t type) - : paddr(paddr), laddr(laddr), len(len), type(type) - {} + : paddr(paddr), laddr(laddr), len(len), type(type) { + assert(len > 0); + } explicit alloc_blk_t() = default; @@ -1989,6 +2016,25 @@ struct alloc_blk_t { denc(v.type, p); DENC_FINISH(p); } + + static alloc_blk_t create_alloc( + const paddr_t& paddr, + const laddr_t& laddr, + extent_len_t len, + extent_types_t type) { + assert(is_backref_mapped_type(type)); + assert(laddr != L_ADDR_NULL); + return alloc_blk_t(paddr, laddr, len, type); + } + + static alloc_blk_t create_retire( + const paddr_t& paddr, + extent_len_t len, + extent_types_t type) { + assert(is_backref_mapped_type(type) || + is_retired_placeholder_type(type)); + return alloc_blk_t(paddr, L_ADDR_NULL, len, type); + } }; // use absolute address diff --git a/src/crimson/os/seastore/segment_manager.cc b/src/crimson/os/seastore/segment_manager.cc index 1be9cce5f6b..3eced41081e 100644 --- a/src/crimson/os/seastore/segment_manager.cc +++ b/src/crimson/os/seastore/segment_manager.cc @@ -16,10 +16,10 @@ namespace crimson::os::seastore { std::ostream& operator<<(std::ostream& out, const block_shard_info_t& sf) { out << "(" - << "size=" << sf.size - << ", segments=" <<sf.segments - << ", tracker_offset=" <<sf.tracker_offset - << ", first_segment_offset=" <<sf.first_segment_offset + << "size=0x" << std::hex << sf.size << std::dec + << ", segments=" << sf.segments + << ", tracker_offset=0x" << std::hex << sf.tracker_offset + << ", first_segment_offset=0x" << sf.first_segment_offset << std::dec <<")"; return out; } @@ -28,8 +28,8 @@ std::ostream& operator<<(std::ostream& out, const block_sm_superblock_t& sb) { out << "superblock(" << "shard_num=" << sb.shard_num - << ", segment_size=" << sb.segment_size - << ", block_size=" << sb.block_size + << ", segment_size=0x" << std::hex << sb.segment_size + << ", block_size=0x" << sb.block_size << std::dec << ", shard_info:"; for (auto &sf : sb.shard_infos) { out << sf diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc index 0500271f81a..7077aad7407 100644 --- a/src/crimson/os/seastore/segment_manager/block.cc +++ b/src/crimson/os/seastore/segment_manager/block.cc @@ -60,7 +60,7 
@@ static write_ertr::future<> do_write( { LOG_PREFIX(block_do_write); auto len = bptr.length(); - TRACE("{} poffset={}~{} ...", + TRACE("{} poffset=0x{:x}~0x{:x} ...", device_id_printer_t{device_id}, offset, len); return device.dma_write( offset, @@ -68,16 +68,16 @@ static write_ertr::future<> do_write( len ).handle_exception( [FNAME, device_id, offset, len](auto e) -> write_ertr::future<size_t> { - ERROR("{} poffset={}~{} got error -- {}", + ERROR("{} poffset=0x{:x}~0x{:x} got error -- {}", device_id_printer_t{device_id}, offset, len, e); return crimson::ct_error::input_output_error::make(); }).then([FNAME, device_id, offset, len](auto result) -> write_ertr::future<> { if (result != len) { - ERROR("{} poffset={}~{} write len={} inconsistent", + ERROR("{} poffset=0x{:x}~0x{:x} write len=0x{:x} inconsistent", device_id_printer_t{device_id}, offset, len, result); return crimson::ct_error::input_output_error::make(); } - TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len); + TRACE("{} poffset=0x{:x}~0x{:x} done", device_id_printer_t{device_id}, offset, len); return write_ertr::now(); }); } @@ -90,7 +90,7 @@ static write_ertr::future<> do_writev( size_t block_size) { LOG_PREFIX(block_do_writev); - TRACE("{} poffset={}~{}, {} buffers", + TRACE("{} poffset=0x{:x}~0x{:x}, {} buffers", device_id_printer_t{device_id}, offset, bl.length(), bl.get_num_buffers()); // writev requires each buffer to be aligned to the disks' block @@ -109,22 +109,22 @@ static write_ertr::future<> do_writev( auto off = offset + p.offset; auto len = p.length; auto& iov = p.iov; - TRACE("{} poffset={}~{} dma_write ...", + TRACE("{} poffset=0x{:x}~0x{:x} dma_write ...", device_id_printer_t{device_id}, off, len); return device.dma_write(off, std::move(iov) ).handle_exception( [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t> { - ERROR("{} poffset={}~{} dma_write got error -- {}", + ERROR("{} poffset=0x{:x}~0x{:x} dma_write got error -- {}", device_id_printer_t{device_id}, off, len, e); return crimson::ct_error::input_output_error::make(); }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> { if (written != len) { - ERROR("{} poffset={}~{} dma_write len={} inconsistent", + ERROR("{} poffset=0x{:x}~0x{:x} dma_write len=0x{:x} inconsistent", device_id_printer_t{device_id}, off, len, written); return crimson::ct_error::input_output_error::make(); } - TRACE("{} poffset={}~{} dma_write done", + TRACE("{} poffset=0x{:x}~0x{:x} dma_write done", device_id_printer_t{device_id}, off, len); return write_ertr::now(); }); @@ -140,7 +140,7 @@ static read_ertr::future<> do_read( bufferptr &bptr) { LOG_PREFIX(block_do_read); - TRACE("{} poffset={}~{} ...", device_id_printer_t{device_id}, offset, len); + TRACE("{} poffset=0x{:x}~0x{:x} ...", device_id_printer_t{device_id}, offset, len); assert(len <= bptr.length()); return device.dma_read( offset, @@ -153,16 +153,16 @@ static read_ertr::future<> do_read( // once seastar::future<T>::handle_exception() returns seastar::futurize_t<T> [FNAME, device_id, offset, len](auto e) -> read_ertr::future<size_t> { - ERROR("{} poffset={}~{} got error -- {}", + ERROR("{} poffset=0x{:x}~0x{:x} got error -- {}", device_id_printer_t{device_id}, offset, len, e); return crimson::ct_error::input_output_error::make(); }).then([FNAME, device_id, offset, len](auto result) -> read_ertr::future<> { if (result != len) { - ERROR("{} poffset={}~{} read len={} inconsistent", + ERROR("{} poffset=0x{:x}~0x{:x} read len=0x{:x} inconsistent", 
device_id_printer_t{device_id}, offset, len, result); return crimson::ct_error::input_output_error::make(); } - TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len); + TRACE("{} poffset=0x{:x}~0x{:x} done", device_id_printer_t{device_id}, offset, len); return read_ertr::now(); }); } @@ -174,7 +174,7 @@ SegmentStateTracker::write_out( uint64_t offset) { LOG_PREFIX(SegmentStateTracker::write_out); - DEBUG("{} poffset={}~{}", + DEBUG("{} poffset=0x{:x}~0x{:x}", device_id_printer_t{device_id}, offset, bptr.length()); return do_write(device_id, device, offset, bptr); } @@ -186,7 +186,7 @@ SegmentStateTracker::read_in( uint64_t offset) { LOG_PREFIX(SegmentStateTracker::read_in); - DEBUG("{} poffset={}~{}", + DEBUG("{} poffset=0x{:x}~0x{:x}", device_id_printer_t{device_id}, offset, bptr.length()); return do_read( device_id, @@ -230,7 +230,7 @@ block_sm_superblock_t make_superblock( + i * segments_per_shard * config_segment_size; } - INFO("{} disk_size={}, segment_size={}, block_size={}", + INFO("{} disk_size=0x{:x}, segment_size=0x{:x}, block_size=0x{:x}", device_id_printer_t{device_id}, size, uint64_t(config_segment_size), @@ -255,7 +255,7 @@ static check_create_device_ret check_create_device( size_t size) { LOG_PREFIX(block_check_create_device); - INFO("path={}, size={}", path, size); + INFO("path={}, size=0x{:x}", path, size); return seastar::open_file_dma( path, seastar::open_flags::exclusive | @@ -266,7 +266,7 @@ static check_create_device_ret check_create_device( file, [size, FNAME, &path](auto &f) -> seastar::future<> { - DEBUG("path={} created, truncating to {}", path, size); + DEBUG("path={} created, truncating to 0x{:x}", path, size); ceph_assert(f); return f.truncate( size @@ -318,8 +318,8 @@ open_device_ret open_device( ).then([stat, &path, FNAME](auto file) mutable { return file.size().then([stat, file, &path, FNAME](auto size) mutable { stat.size = size; - INFO("path={} successful, size={}, block_size={}", - path, stat.size, stat.block_size); + INFO("path={} successful, size=0x{:x}, block_size=0x{:x}", + path, stat.size, stat.block_size); return std::make_pair(file, stat); }); }); @@ -410,19 +410,19 @@ Segment::write_ertr::future<> BlockSegment::write( { LOG_PREFIX(BlockSegment::write); auto paddr = paddr_t::make_seg_paddr(id, offset); - DEBUG("{} offset={}~{} poffset={} ...", + DEBUG("{} offset=0x{:x}~0x{:x} poffset=0x{:x} ...", id, offset, bl.length(), manager.get_offset(paddr)); if (offset < write_pointer || offset % manager.superblock.block_size != 0 || bl.length() % manager.superblock.block_size != 0) { - ERROR("{} offset={}~{} poffset={} invalid write", + ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} invalid write", id, offset, bl.length(), manager.get_offset(paddr)); return crimson::ct_error::invarg::make(); } if (offset + bl.length() > manager.superblock.segment_size) { - ERROR("{} offset={}~{} poffset={} write out of the range {}", + ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} write out of the range 0x{:x}", id, offset, bl.length(), manager.get_offset(paddr), manager.superblock.segment_size); return crimson::ct_error::enospc::make(); @@ -443,7 +443,7 @@ Segment::close_ertr::future<> BlockSegmentManager::segment_close( LOG_PREFIX(BlockSegmentManager::segment_close); auto s_id = id.device_segment_id(); int unused_bytes = get_segment_size() - write_pointer; - INFO("{} unused_bytes={} ...", id, unused_bytes); + INFO("{} unused_bytes=0x{:x} ...", id, unused_bytes); assert(unused_bytes >= 0); assert(id.device_id() == get_device_id()); @@ -693,24 +693,24 
@@ SegmentManager::read_ertr::future<> BlockSegmentManager::read( auto s_id = id.device_segment_id(); auto s_off = seg_addr.get_segment_off(); auto p_off = get_offset(addr); - DEBUG("{} offset={}~{} poffset={} ...", id, s_off, len, p_off); + DEBUG("{} offset=0x{:x}~0x{:x} poffset=0x{:x} ...", id, s_off, len, p_off); assert(addr.get_device_id() == get_device_id()); if (s_off % superblock.block_size != 0 || len % superblock.block_size != 0) { - ERROR("{} offset={}~{} poffset={} invalid read", id, s_off, len, p_off); + ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} invalid read", id, s_off, len, p_off); return crimson::ct_error::invarg::make(); } if (s_id >= get_num_segments()) { - ERROR("{} offset={}~{} poffset={} segment-id out of range {}", + ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} segment-id out of range {}", id, s_off, len, p_off, get_num_segments()); return crimson::ct_error::invarg::make(); } if (s_off + len > superblock.segment_size) { - ERROR("{} offset={}~{} poffset={} read out of range {}", + ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} read out of range 0x{:x}", id, s_off, len, p_off, superblock.segment_size); return crimson::ct_error::invarg::make(); } @@ -718,7 +718,7 @@ SegmentManager::read_ertr::future<> BlockSegmentManager::read( if (tracker->get(s_id) == segment_state_t::EMPTY) { // XXX: not an error during scanning, // might need refactor to increase the log level - DEBUG("{} offset={}~{} poffset={} invalid state {}", + DEBUG("{} offset=0x{:x}~0x{:x} poffset=0x{:x} invalid state {}", id, s_off, len, p_off, tracker->get(s_id)); return crimson::ct_error::enoent::make(); } diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc index 4a4873afb94..bdd97e88733 100644 --- a/src/crimson/os/seastore/segment_manager/ephemeral.cc +++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc @@ -20,8 +20,11 @@ namespace { namespace crimson::os::seastore::segment_manager { std::ostream &operator<<(std::ostream &lhs, const ephemeral_config_t &c) { - return lhs << "ephemeral_config_t(size=" << c.size << ", block_size=" << c.block_size - << ", segment_size=" << c.segment_size << ")"; + return lhs << "ephemeral_config_t(size=0x" + << std::hex << c.size + << ", block_size=0x" << c.block_size + << ", segment_size=0x" << c.segment_size + << std::dec << ")"; } EphemeralSegmentManagerRef create_test_ephemeral() { @@ -141,7 +144,8 @@ Segment::write_ertr::future<> EphemeralSegmentManager::segment_write( { auto& seg_addr = addr.as_seg_paddr(); logger().debug( - "segment_write to segment {} at offset {}, physical offset {}, len {}, crc {}", + "segment_write to segment {} at offset 0x{:x}, " + "physical offset 0x{:x}, len 0x{:x}, crc 0x{:x}", seg_addr.get_segment_id(), seg_addr.get_segment_off(), get_offset(addr), @@ -268,7 +272,7 @@ SegmentManager::read_ertr::future<> EphemeralSegmentManager::read( if (seg_addr.get_segment_off() + len > config.segment_size) { logger().error( - "EphemeralSegmentManager::read: invalid offset {}~{}!", + "EphemeralSegmentManager::read: invalid offset {}~0x{:x}!", addr, len); return crimson::ct_error::invarg::make(); @@ -279,7 +283,8 @@ SegmentManager::read_ertr::future<> EphemeralSegmentManager::read( bufferlist bl; bl.push_back(out); logger().debug( - "segment_read to segment {} at offset {}, physical offset {}, length {}, crc {}", + "segment_read to segment {} at offset 0x{:x}, " + "physical offset 0x{:x}, length 0x{:x}, crc 0x{:x}", seg_addr.get_segment_id().device_segment_id(), 
seg_addr.get_segment_off(), get_offset(addr), diff --git a/src/crimson/os/seastore/segment_manager/zbd.cc b/src/crimson/os/seastore/segment_manager/zbd.cc index 88521a947f8..22efbed5940 100644 --- a/src/crimson/os/seastore/segment_manager/zbd.cc +++ b/src/crimson/os/seastore/segment_manager/zbd.cc @@ -56,7 +56,7 @@ static open_device_ret open_device( path, seastar::follow_symlink::yes ).then([FNAME, mode, &path](auto stat) mutable { return seastar::open_file_dma(path, mode).then([=](auto file) { - DEBUG("open of device {} successful, size {}", + DEBUG("open of device {} successful, size 0x{:x}", path, stat.size); return std::make_pair(file, stat); @@ -100,11 +100,12 @@ static zbd_sm_metadata_t make_metadata( WARN("Ignoring configuration values for device and segment size"); INFO( - "device size: {}, available size: {}, block size: {}, allocated size: {}," - " total zones {}, zone size: {}, zone capacity: {}," - " total segments: {}, zones per segment: {}, segment size: {}" + "device size: 0x{:x}, available size: 0x{:x}," + " block size: 0x{:x}, allocated size: 0x{:x}," + " total zones {}, zone size: 0x{:x}, zone capacity: 0x{:x}," + " total segments: {}, zones per segment: {}, segment size: 0x{:x}" " conv zones: {}, swr zones: {}, per shard segments: {}" - " per shard available size: {}", + " per shard available size: 0x{:x}", total_size, available_size, data.block_size, @@ -126,8 +127,8 @@ static zbd_sm_metadata_t make_metadata( shard_infos[i].segments = per_shard_segments; shard_infos[i].first_segment_offset = zone_size * skipped_zones + i * segment_size * per_shard_segments; - INFO("First segment offset for shard {} is: {}", - i, shard_infos[i].first_segment_offset); + INFO("First segment offset for shard {} is: 0x{:x}", + i, shard_infos[i].first_segment_offset); } zbd_sm_metadata_t ret = zbd_sm_metadata_t{ @@ -248,7 +249,7 @@ static write_ertr::future<> do_write( bufferptr &bptr) { LOG_PREFIX(ZBDSegmentManager::do_write); - DEBUG("offset {} len {}", + DEBUG("offset 0x{:x} len 0x{:x}", offset, bptr.length()); return device.dma_write( @@ -277,7 +278,7 @@ static write_ertr::future<> do_writev( size_t block_size) { LOG_PREFIX(ZBDSegmentManager::do_writev); - DEBUG("{} offset {} len {}", + DEBUG("{} offset 0x{:x} len 0x{:x}", device_id_printer_t{device_id}, offset, bl.length()); // writev requires each buffer to be aligned to the disks' block // size, we need to rebuild here @@ -295,23 +296,23 @@ static write_ertr::future<> do_writev( auto off = offset + p.offset; auto len = p.length; auto& iov = p.iov; - DEBUG("{} poffset={}~{} dma_write ...", + DEBUG("{} poffset=0x{:x}~0x{:x} dma_write ...", device_id_printer_t{device_id}, off, len); return device.dma_write(off, std::move(iov) ).handle_exception( [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t> { - ERROR("{} poffset={}~{} dma_write got error -- {}", + ERROR("{} poffset=0x{:x}~0x{:x} dma_write got error -- {}", device_id_printer_t{device_id}, off, len, e); return crimson::ct_error::input_output_error::make(); }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> { if (written != len) { - ERROR("{} poffset={}~{} dma_write len={} inconsistent", + ERROR("{} poffset=0x{:x}~0x{:x} dma_write len=0x{:x} inconsistent", device_id_printer_t{device_id}, off, len, written); return crimson::ct_error::input_output_error::make(); } - DEBUG("{} poffset={}~{} dma_write done", + DEBUG("{} poffset=0x{:x}~0x{:x} dma_write done", device_id_printer_t{device_id}, off, len); return write_ertr::now(); @@ -329,12 +330,12 @@ 
write_metadata(seastar::file &device, zbd_sm_metadata_t sb) bufferptr(ceph::buffer::create_page_aligned(sb.block_size)), [=, &device](auto &bp) { LOG_PREFIX(ZBDSegmentManager::write_metadata); - DEBUG("block_size {}", sb.block_size); + DEBUG("block_size 0x{:x}", sb.block_size); bufferlist bl; encode(sb, bl); auto iter = bl.begin(); assert(bl.length() < sb.block_size); - DEBUG("buffer length {}", bl.length()); + DEBUG("buffer length 0x{:x}", bl.length()); iter.copy(bl.length(), bp.c_str()); DEBUG("doing writeout"); return do_write(device, 0, bp); @@ -349,7 +350,7 @@ static read_ertr::future<> do_read( { LOG_PREFIX(ZBDSegmentManager::do_read); assert(len <= bptr.length()); - DEBUG("offset {} len {}", + DEBUG("offset 0x{:x} len 0x{:x}", offset, len); return device.dma_read( @@ -659,7 +660,7 @@ SegmentManager::read_ertr::future<> ZBDSegmentManager::read( } if (seg_addr.get_segment_off() + len > metadata.segment_capacity) { - ERROR("invalid read offset {}, len {}", + ERROR("invalid read offset {}, len 0x{:x}", addr, len); return crimson::ct_error::invarg::make(); @@ -703,7 +704,7 @@ Segment::write_ertr::future<> ZBDSegmentManager::segment_write( assert(addr.get_device_id() == get_device_id()); assert((bl.length() % metadata.block_size) == 0); auto& seg_addr = addr.as_seg_paddr(); - DEBUG("write to segment {} at offset {}, physical offset {}, len {}", + DEBUG("write to segment {} at offset 0x{:x}, physical offset 0x{:x}, len 0x{:x}", seg_addr.get_segment_id(), seg_addr.get_segment_off(), get_offset(addr), @@ -756,7 +757,7 @@ Segment::write_ertr::future<> ZBDSegment::write( LOG_PREFIX(ZBDSegment::write); if (offset != write_pointer || offset % manager.metadata.block_size != 0) { ERROR("Segment offset and zone write pointer mismatch. " - "segment {} segment-offset {} write pointer {}", + "segment {} segment-offset 0x{:x} write pointer 0x{:x}", id, offset, write_pointer); return crimson::ct_error::invarg::make(); } @@ -772,7 +773,7 @@ Segment::write_ertr::future<> ZBDSegment::write_padding_bytes( size_t padding_bytes) { LOG_PREFIX(ZBDSegment::write_padding_bytes); - DEBUG("Writing {} padding bytes to segment {} at wp {}", + DEBUG("Writing 0x{:x} padding bytes to segment {} at wp 0x{:x}", padding_bytes, id, write_pointer); return crimson::repeat([FNAME, padding_bytes, this] () mutable { @@ -804,7 +805,7 @@ Segment::write_ertr::future<> ZBDSegment::advance_wp( { LOG_PREFIX(ZBDSegment::advance_wp); - DEBUG("Advancing write pointer from {} to {}", write_pointer, offset); + DEBUG("Advancing write pointer from 0x{:x} to 0x{:x}", write_pointer, offset); if (offset < write_pointer) { return crimson::ct_error::invarg::make(); } diff --git a/src/crimson/os/seastore/segment_manager_group.cc b/src/crimson/os/seastore/segment_manager_group.cc index 332b794b70e..f4822c9a18c 100644 --- a/src/crimson/os/seastore/segment_manager_group.cc +++ b/src/crimson/os/seastore/segment_manager_group.cc @@ -26,13 +26,13 @@ SegmentManagerGroup::read_segment_tail(segment_id_t segment) } ).safe_then([=, &segment_manager](bufferptr bptr) -> read_segment_tail_ret { LOG_PREFIX(SegmentManagerGroup::read_segment_tail); - DEBUG("segment {} bptr size {}", segment, bptr.length()); + DEBUG("segment {} bptr size 0x{:x}", segment, bptr.length()); segment_tail_t tail; bufferlist bl; bl.push_back(bptr); - DEBUG("segment {} block crc {}", + DEBUG("segment {} block crc 0x{:x}", segment, bl.begin().crc32c(segment_manager.get_block_size(), 0)); @@ -66,13 +66,13 @@ SegmentManagerGroup::read_segment_header(segment_id_t segment) } ).safe_then([=, 
&segment_manager](bufferptr bptr) -> read_segment_header_ret { LOG_PREFIX(SegmentManagerGroup::read_segment_header); - DEBUG("segment {} bptr size {}", segment, bptr.length()); + DEBUG("segment {} bptr size 0x{:x}", segment, bptr.length()); segment_header_t header; bufferlist bl; bl.push_back(bptr); - DEBUG("segment {} block crc {}", + DEBUG("segment {} block crc 0x{:x}", segment, bl.begin().crc32c(segment_manager.get_block_size(), 0)); @@ -111,7 +111,7 @@ SegmentManagerGroup::read(paddr_t start, size_t len) LOG_PREFIX(SegmentManagerGroup::read); assert(has_device(start.get_device_id())); auto& segment_manager = *segment_managers[start.get_device_id()]; - TRACE("reading data {}~{}", start, len); + TRACE("reading data {}~0x{:x}", start, len); return segment_manager.read( start, len diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 5d8ad00ba22..cd8c333c69f 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -8,16 +8,17 @@ #include <boost/intrusive/list.hpp> #include "crimson/common/log.h" +#include "crimson/os/seastore/backref_entry.h" +#include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/logging.h" #include "crimson/os/seastore/ordering_handle.h" -#include "crimson/os/seastore/seastore_types.h" -#include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/root_block.h" +#include "crimson/os/seastore/seastore_types.h" +#include "crimson/os/seastore/transaction_interruptor.h" namespace crimson::os::seastore { class SeaStore; -class Transaction; struct io_stat_t { uint64_t num = 0; @@ -408,12 +409,14 @@ public: src_t src, journal_seq_t initiated_after, on_destruct_func_t&& f, - transaction_id_t trans_id + transaction_id_t trans_id, + cache_hint_t cache_hint ) : weak(weak), handle(std::move(handle)), on_destruct(std::move(f)), src(src), - trans_id(trans_id) + trans_id(trans_id), + cache_hint(cache_hint) {} void invalidate_clear_write_set() { @@ -460,6 +463,7 @@ public: ool_write_stats = {}; rewrite_stats = {}; conflicted = false; + assert(backref_entries.empty()); if (!has_reset) { has_reset = true; } @@ -571,10 +575,23 @@ public: return pre_alloc_list; } + cache_hint_t get_cache_hint() const { + return cache_hint; + } + private: friend class Cache; friend Ref make_test_transaction(); + void set_backref_entries(backref_entry_refs_t&& entries) { + assert(backref_entries.empty()); + backref_entries = std::move(entries); + } + + backref_entry_refs_t move_backref_entries() { + return std::move(backref_entries); + } + /** * If set, *this may not be used to perform writes and will not provide * consistency, allowing operations using it to avoid maintaining a read_set.
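The constructor change above threads a per-transaction cache_hint_t through to Cache, giving the replacement policy a per-transaction signal. A minimal self-contained sketch of the intent, using simplified stand-in types (the real cache_hint_t values beyond CACHE_HINT_TOUCH and the Cache/LRU internals are not shown in this diff, so everything below is illustrative only):

// Toy model: a transaction-level hint lets scan-style readers opt out of
// LRU promotion so one-off reads cannot evict the hot working set.
enum class cache_hint_t { TOUCH, NOCACHE };  // stand-in for the real type

struct CachedExtent {};

struct LRU {
  void move_to_top(CachedExtent&) { /* promote to MRU position */ }
};

struct Transaction {
  cache_hint_t cache_hint = cache_hint_t::TOUCH;  // touch is the default
  cache_hint_t get_cache_hint() const { return cache_hint; }
};

// Hypothetical read-side consumer of the hint.
void maybe_touch_extent(LRU &lru, Transaction &t, CachedExtent &ext) {
  if (t.get_cache_hint() == cache_hint_t::NOCACHE) {
    return;  // e.g. a backfill or scrub scan: read without promoting
  }
  lru.move_to_top(ext);  // normal reads keep the extent hot
}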
@@ -669,6 +686,10 @@ private: transaction_id_t trans_id = TRANS_ID_NULL; seastar::lw_shared_ptr<rbm_pending_ool_t> pending_ool; + + backref_entry_refs_t backref_entries; + + cache_hint_t cache_hint = CACHE_HINT_TOUCH; }; using TransactionRef = Transaction::Ref; @@ -681,67 +702,11 @@ inline TransactionRef make_test_transaction() { Transaction::src_t::MUTATE, JOURNAL_SEQ_NULL, [](Transaction&) {}, - ++next_id + ++next_id, + CACHE_HINT_TOUCH ); } -struct TransactionConflictCondition { - class transaction_conflict final : public std::exception { - public: - const char* what() const noexcept final { - return "transaction conflict detected"; - } - }; - -public: - TransactionConflictCondition(Transaction &t) : t(t) {} - - template <typename Fut> - std::optional<Fut> may_interrupt() { - if (t.conflicted) { - return seastar::futurize<Fut>::make_exception_future( - transaction_conflict()); - } else { - return std::optional<Fut>(); - } - } - - template <typename T> - static constexpr bool is_interruption_v = - std::is_same_v<T, transaction_conflict>; - - - static bool is_interruption(std::exception_ptr& eptr) { - return *eptr.__cxa_exception_type() == typeid(transaction_conflict); - } - -private: - Transaction &t; -}; - -using trans_intr = crimson::interruptible::interruptor< - TransactionConflictCondition - >; - -template <typename E> -using trans_iertr = - crimson::interruptible::interruptible_errorator< - TransactionConflictCondition, - E - >; - -template <typename F, typename... Args> -auto with_trans_intr(Transaction &t, F &&f, Args&&... args) { - return trans_intr::with_interruption_to_error<crimson::ct_error::eagain>( - std::move(f), - TransactionConflictCondition(t), - t, - std::forward<Args>(args)...); -} - -template <typename T> -using with_trans_ertr = typename T::base_ertr::template extend<crimson::ct_error::eagain>; - } #if FMT_VERSION >= 90000 diff --git a/src/crimson/os/seastore/transaction_interruptor.cc b/src/crimson/os/seastore/transaction_interruptor.cc new file mode 100644 index 00000000000..d22f760f2db --- /dev/null +++ b/src/crimson/os/seastore/transaction_interruptor.cc @@ -0,0 +1,15 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/transaction_interruptor.h" + +#include "crimson/os/seastore/transaction.h" + +namespace crimson::os::seastore { + +bool TransactionConflictCondition::is_conflicted() const +{ + return t.conflicted; +} + +} diff --git a/src/crimson/os/seastore/transaction_interruptor.h b/src/crimson/os/seastore/transaction_interruptor.h new file mode 100644 index 00000000000..d0522c23c19 --- /dev/null +++ b/src/crimson/os/seastore/transaction_interruptor.h @@ -0,0 +1,77 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <exception> +#include <optional> +#include <type_traits> +#include <utility> + +#include "crimson/common/errorator.h" +#include "crimson/common/interruptible_future.h" + +namespace crimson::os::seastore { + +class Transaction; + +struct TransactionConflictCondition { + class transaction_conflict final : public std::exception { + public: + const char* what() const noexcept final { + return "transaction conflict detected"; + } + }; + +public: + TransactionConflictCondition(Transaction &t) : t(t) {} + + template <typename Fut> + std::optional<Fut> may_interrupt() { + if (is_conflicted()) { + return seastar::futurize<Fut>::make_exception_future( + transaction_conflict()); + } else { + return 
std::optional<Fut>(); + } + } + + template <typename T> + static constexpr bool is_interruption_v = + std::is_same_v<T, transaction_conflict>; + + + static bool is_interruption(std::exception_ptr& eptr) { + return *eptr.__cxa_exception_type() == typeid(transaction_conflict); + } + +private: + bool is_conflicted() const; + + Transaction &t; +}; + +using trans_intr = crimson::interruptible::interruptor< + TransactionConflictCondition + >; + +template <typename E> +using trans_iertr = + crimson::interruptible::interruptible_errorator< + TransactionConflictCondition, + E + >; + +template <typename F, typename... Args> +auto with_trans_intr(Transaction &t, F &&f, Args&&... args) { + return trans_intr::with_interruption_to_error<crimson::ct_error::eagain>( + std::move(f), + TransactionConflictCondition(t), + t, + std::forward<Args>(args)...); +} + +template <typename T> +using with_trans_ertr = typename T::base_ertr::template extend<crimson::ct_error::eagain>; + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index f4e3b0858f2..807d88b2cbc 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -66,6 +66,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() return with_transaction_intr( Transaction::src_t::MUTATE, "mkfs_tm", + CACHE_HINT_TOUCH, [this, FNAME](auto& t) { cache->init(); @@ -74,6 +75,8 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs() return lba_manager->mkfs(t); }).si_then([this, &t] { return backref_manager->mkfs(t); + }).si_then([this, &t] { + return init_root_meta(t); }).si_then([this, FNAME, &t] { INFOT("submitting mkfs transaction", t); return submit_transaction_direct(t); @@ -129,6 +132,7 @@ TransactionManager::mount() journal->get_trimmer().set_journal_head(start_seq); return with_transaction_weak( "mount", + CACHE_HINT_TOUCH, [this](auto &t) { return cache->init_cached_extents(t, [this](auto &t, auto &e) { @@ -219,7 +223,7 @@ TransactionManager::ref_ret TransactionManager::inc_ref( TRACET("{}", t, offset); return lba_manager->incref_extent(t, offset ).si_then([FNAME, offset, &t](auto result) { - DEBUGT("extent refcount is incremented to {} -- {}~{}, {}", + DEBUGT("extent refcount is incremented to {} -- {}~0x{:x}, {}", t, result.refcount, offset, result.length, result.addr); return result.refcount; }); @@ -459,8 +463,12 @@ TransactionManager::do_submit_transaction( } SUBTRACET(seastore_t, "submitting record", tref); - return journal->submit_record(std::move(record), tref.get_handle() - ).safe_then([this, FNAME, &tref](auto submit_result) mutable { + return journal->submit_record( + std::move(record), + tref.get_handle(), + tref.get_src(), + [this, FNAME, &tref](record_locator_t submit_result) + { SUBDEBUGT(seastore_t, "committed with {}", tref, submit_result); auto start_seq = submit_result.write_result.start_seq; journal->get_trimmer().set_journal_head(start_seq); @@ -471,10 +479,8 @@ TransactionManager::do_submit_transaction( journal->get_trimmer().update_journal_tails( cache->get_oldest_dirty_from().value_or(start_seq), cache->get_oldest_backref_dirty_from().value_or(start_seq)); - return journal->finish_commit(tref.get_src() - ).then([&tref] { - return tref.get_handle().complete(); - }); + }).safe_then([&tref] { + return tref.get_handle().complete(); }).handle_error( submit_transaction_iertr::pass_further{}, crimson::ct_error::assert_all{"Hit error submitting to journal"} @@ 
-506,7 +512,7 @@ TransactionManager::get_next_dirty_extents( size_t max_bytes) { LOG_PREFIX(TransactionManager::get_next_dirty_extents); - DEBUGT("max_bytes={}B, seq={}", t, max_bytes, seq); + DEBUGT("max_bytes=0x{:x}B, seq={}", t, max_bytes, seq); return cache->get_next_dirty_extents(t, seq, max_bytes); } @@ -521,101 +527,111 @@ TransactionManager::rewrite_logical_extent( ceph_abort(); } - auto lextent = extent->cast<LogicalCachedExtent>(); - cache->retire_extent(t, extent); - if (get_extent_category(lextent->get_type()) == data_category_t::METADATA) { - auto nlextent = cache->alloc_new_extent_by_type( + if (get_extent_category(extent->get_type()) == data_category_t::METADATA) { + assert(extent->is_fully_loaded()); + cache->retire_extent(t, extent); + auto nextent = cache->alloc_new_extent_by_type( t, - lextent->get_type(), - lextent->get_length(), - lextent->get_user_hint(), + extent->get_type(), + extent->get_length(), + extent->get_user_hint(), // get target rewrite generation - lextent->get_rewrite_generation())->cast<LogicalCachedExtent>(); - nlextent->rewrite(t, *lextent, 0); + extent->get_rewrite_generation())->cast<LogicalCachedExtent>(); + nextent->rewrite(t, *extent, 0); - DEBUGT("rewriting meta -- {} to {}", t, *lextent, *nlextent); + DEBUGT("rewriting meta -- {} to {}", t, *extent, *nextent); #ifndef NDEBUG - if (get_checksum_needed(lextent->get_paddr())) { - assert(lextent->get_last_committed_crc() == lextent->calc_crc32c()); + if (get_checksum_needed(extent->get_paddr())) { + assert(extent->get_last_committed_crc() == extent->calc_crc32c()); } else { - assert(lextent->get_last_committed_crc() == CRC_NULL); + assert(extent->get_last_committed_crc() == CRC_NULL); } #endif - nlextent->set_last_committed_crc(lextent->get_last_committed_crc()); + nextent->set_last_committed_crc(extent->get_last_committed_crc()); /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc * extents since we're going to do it again once we either do the ool write * or allocate a relative inline addr. TODO: refactor AsyncCleaner to * avoid this complication. 
*/ return lba_manager->update_mapping( t, - lextent->get_laddr(), - lextent->get_length(), - lextent->get_paddr(), - nlextent->get_length(), - nlextent->get_paddr(), - nlextent->get_last_committed_crc(), - nlextent.get()).discard_result(); + extent->get_laddr(), + extent->get_length(), + extent->get_paddr(), + nextent->get_length(), + nextent->get_paddr(), + nextent->get_last_committed_crc(), + nextent.get() + ).discard_result(); } else { - assert(get_extent_category(lextent->get_type()) == data_category_t::DATA); - auto extents = cache->alloc_new_data_extents_by_type( - t, - lextent->get_type(), - lextent->get_length(), - lextent->get_user_hint(), - // get target rewrite generation - lextent->get_rewrite_generation()); - return seastar::do_with( - std::move(extents), - 0, - lextent->get_length(), - extent_ref_count_t(0), - [this, FNAME, lextent, &t] - (auto &extents, auto &off, auto &left, auto &refcount) { - return trans_intr::do_for_each( - extents, - [lextent, this, FNAME, &t, &off, &left, &refcount](auto &nextent) { - bool first_extent = (off == 0); - ceph_assert(left >= nextent->get_length()); - auto nlextent = nextent->template cast<LogicalCachedExtent>(); - nlextent->rewrite(t, *lextent, off); - DEBUGT("rewriting data -- {} to {}", t, *lextent, *nlextent); - - /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc - * extents since we're going to do it again once we either do the ool write - * or allocate a relative inline addr. TODO: refactor AsyncCleaner to - * avoid this complication. */ - auto fut = base_iertr::now(); - if (first_extent) { - fut = lba_manager->update_mapping( - t, - (lextent->get_laddr() + off).checked_to_laddr(), - lextent->get_length(), - lextent->get_paddr(), - nlextent->get_length(), - nlextent->get_paddr(), - nlextent->get_last_committed_crc(), - nlextent.get() - ).si_then([&refcount](auto c) { - refcount = c; - }); - } else { - ceph_assert(refcount != 0); - fut = lba_manager->alloc_extent( - t, - (lextent->get_laddr() + off).checked_to_laddr(), - *nlextent, - refcount - ).si_then([lextent, nlextent, off](auto mapping) { - ceph_assert(mapping->get_key() == lextent->get_laddr() + off); - ceph_assert(mapping->get_val() == nlextent->get_paddr()); + assert(get_extent_category(extent->get_type()) == data_category_t::DATA); + auto length = extent->get_length(); + return cache->read_extent_maybe_partial( + t, std::move(extent), 0, length + ).si_then([this, FNAME, &t](auto extent) { + assert(extent->is_fully_loaded()); + cache->retire_extent(t, extent); + auto extents = cache->alloc_new_data_extents_by_type( + t, + extent->get_type(), + extent->get_length(), + extent->get_user_hint(), + // get target rewrite generation + extent->get_rewrite_generation()); + return seastar::do_with( + std::move(extents), + 0, + extent->get_length(), + extent_ref_count_t(0), + [this, FNAME, extent, &t] + (auto &extents, auto &off, auto &left, auto &refcount) + { + return trans_intr::do_for_each( + extents, + [extent, this, FNAME, &t, &off, &left, &refcount](auto &_nextent) + { + auto nextent = _nextent->template cast<LogicalCachedExtent>(); + bool first_extent = (off == 0); + ceph_assert(left >= nextent->get_length()); + nextent->rewrite(t, *extent, off); + DEBUGT("rewriting data -- {} to {}", t, *extent, *nextent); + + /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc + * extents since we're going to do it again once we either do the ool write + * or allocate a relative inline addr. 
TODO: refactor AsyncCleaner to + * avoid this complication. */ + auto fut = base_iertr::now(); + if (first_extent) { + fut = lba_manager->update_mapping( + t, + (extent->get_laddr() + off).checked_to_laddr(), + extent->get_length(), + extent->get_paddr(), + nextent->get_length(), + nextent->get_paddr(), + nextent->get_last_committed_crc(), + nextent.get() + ).si_then([&refcount](auto c) { + refcount = c; + }); + } else { + ceph_assert(refcount != 0); + fut = lba_manager->alloc_extent( + t, + (extent->get_laddr() + off).checked_to_laddr(), + *nextent, + refcount + ).si_then([extent, nextent, off](auto mapping) { + ceph_assert(mapping->get_key() == extent->get_laddr() + off); + ceph_assert(mapping->get_val() == nextent->get_paddr()); + return seastar::now(); + }); + } + return fut.si_then([&off, &left, nextent] { + off += nextent->get_length(); + left -= nextent->get_length(); return seastar::now(); }); - } - return fut.si_then([&off, &left, nlextent] { - off += nlextent->get_length(); - left -= nlextent->get_length(); - return seastar::now(); }); }); }); @@ -714,7 +730,7 @@ TransactionManager::get_extents_if_live( ceph_assert(paddr.get_addr_type() == paddr_types_t::SEGMENT); return cache->get_extent_if_cached(t, paddr, type - ).si_then([=, this, &t](auto extent) + ).si_then([this, FNAME, type, paddr, laddr, len, &t](auto extent) -> get_extents_if_live_ret { if (extent && extent->get_length() == len) { DEBUGT("{} {}~0x{:x} {} is cached and alive -- {}", @@ -731,19 +747,24 @@ TransactionManager::get_extents_if_live( t, laddr, len - ).si_then([=, this, &t](lba_pin_list_t pin_list) { + ).si_then([this, FNAME, type, paddr, laddr, len, &t](lba_pin_list_t pin_list) { return seastar::do_with( std::list<CachedExtentRef>(), - [=, this, &t, pin_list=std::move(pin_list)]( - std::list<CachedExtentRef> &list) mutable + std::move(pin_list), + [this, FNAME, type, paddr, laddr, len, &t] + (std::list<CachedExtentRef> &extent_list, auto& pin_list) { auto paddr_seg_id = paddr.as_seg_paddr().get_segment_id(); return trans_intr::parallel_for_each( pin_list, - [=, this, &list, &t]( - LBAMappingRef &pin) -> Cache::get_extent_iertr::future<> + [this, FNAME, type, paddr_seg_id, &extent_list, &t]( + LBAMappingRef& pin) -> Cache::get_extent_iertr::future<> { + DEBUGT("got pin, try read in parallel ... 
-- {}", t, *pin); auto pin_paddr = pin->get_val(); + if (pin_paddr.get_addr_type() != paddr_types_t::SEGMENT) { + return seastar::now(); + } auto &pin_seg_paddr = pin_paddr.as_seg_paddr(); auto pin_paddr_seg_id = pin_seg_paddr.get_segment_id(); // auto pin_len = pin->get_length(); @@ -767,16 +788,16 @@ TransactionManager::get_extents_if_live( // ceph_assert(pin_seg_paddr >= paddr && // pin_seg_paddr.add_offset(pin_len) <= paddr.add_offset(len)); return read_pin_by_type(t, std::move(pin), type - ).si_then([&list](auto ret) { - list.emplace_back(std::move(ret)); + ).si_then([&extent_list](auto ret) { + extent_list.emplace_back(std::move(ret)); return seastar::now(); }); - }).si_then([&list, &t, FNAME, type, laddr, len, paddr] { + }).si_then([&extent_list, &t, FNAME, type, laddr, len, paddr] { DEBUGT("{} {}~0x{:x} {} is alive as {} extents", - t, type, laddr, len, paddr, list.size()); + t, type, laddr, len, paddr, extent_list.size()); return get_extents_if_live_ret( interruptible::ready_future_marker{}, - std::move(list)); + std::move(extent_list)); }); }); }).handle_error_interruptible(crimson::ct_error::enoent::handle([] { diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index c7a94a9ef11..e574460894a 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -23,6 +23,7 @@ #include "crimson/os/seastore/logging.h" #include "crimson/os/seastore/seastore_types.h" #include "crimson/os/seastore/cache.h" +#include "crimson/os/seastore/root_meta.h" #include "crimson/os/seastore/lba_manager.h" #include "crimson/os/seastore/backref_manager.h" #include "crimson/os/seastore/journal.h" @@ -136,14 +137,66 @@ public: } /** + * maybe_indirect_extent_t + * + * Contains necessary information in case the extent is loaded from an + * indirect pin. 
+ */ + struct indirect_info_t { + extent_len_t intermediate_offset = 0; + extent_len_t length = 0; + }; + template <typename T> + struct maybe_indirect_extent_t { + TCachedExtentRef<T> extent; + std::optional<indirect_info_t> maybe_indirect_info; + bool is_clone = false; + + bool is_indirect() const { + return maybe_indirect_info.has_value(); + } + + ceph::bufferlist get_bl() const { + if (is_indirect()) { + return do_get_indirect_range(0, maybe_indirect_info->length); + } else { + assert(extent->is_fully_loaded()); + bufferlist bl; + bl.append(extent->get_bptr()); + return bl; + } + } + + ceph::bufferlist get_range( + extent_len_t offset, extent_len_t length) const { + if (is_indirect()) { + return do_get_indirect_range(offset, length); + } else { + return extent->get_range(offset, length); + } + } + private: + ceph::bufferlist do_get_indirect_range( + extent_len_t offset, extent_len_t length) const { + assert(is_indirect()); + assert(maybe_indirect_info->intermediate_offset + offset + length <= + extent->get_length()); + assert(offset + length <= maybe_indirect_info->length); + return extent->get_range( + maybe_indirect_info->intermediate_offset + offset, + length); + } + }; + + /** * read_extent * * Read extent of type T at offset~length */ using read_extent_iertr = get_pin_iertr; template <typename T> - using read_extent_ret = read_extent_iertr::future< - TCachedExtentRef<T>>; + using read_extent_ret = + read_extent_iertr::future<maybe_indirect_extent_t<T>>; template <typename T> read_extent_ret<T> read_extent( Transaction &t, @@ -191,12 +244,30 @@ public: } template <typename T> - base_iertr::future<TCachedExtentRef<T>> read_pin( + base_iertr::future<maybe_indirect_extent_t<T>> read_pin( Transaction &t, - LBAMappingRef pin) + LBAMappingRef pin, + extent_len_t partial_off, + extent_len_t partial_len) { + static_assert(is_logical_type(T::TYPE)); + assert(is_aligned(partial_off, get_block_size())); + assert(is_aligned(partial_len, get_block_size())); + + extent_len_t direct_partial_off = partial_off; + bool is_clone = pin->is_clone(); + std::optional<indirect_info_t> maybe_indirect_info; + if (pin->is_indirect()) { + auto intermediate_offset = pin->get_intermediate_offset(); + direct_partial_off = intermediate_offset + partial_off; + maybe_indirect_info = indirect_info_t{ + intermediate_offset, pin->get_length()}; + } + LOG_PREFIX(TransactionManager::read_pin); - SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin); + SUBDEBUGT(seastore_tm, "{} {} 0x{:x}~0x{:x} direct_off=0x{:x} ...", + t, T::TYPE, *pin, partial_off, partial_len, direct_partial_off); + auto fut = base_iertr::make_ready_future<LBAMappingRef>(); if (!pin->is_parent_viewable()) { if (pin->is_parent_valid()) { @@ -213,21 +284,42 @@ public: pin->maybe_fix_pos(); fut = base_iertr::make_ready_future<LBAMappingRef>(std::move(pin)); } - return fut.si_then([&t, this](auto npin) mutable { + return fut.si_then([&t, this, direct_partial_off, partial_len](auto npin) { // checking the lba child must be atomic with creating // and linking the absent child auto ret = get_extent_if_linked<T>(t, std::move(npin)); if (ret.index() == 1) { - return std::move(std::get<1>(ret)); + return std::get<1>(ret + ).si_then([direct_partial_off, partial_len, this, &t](auto extent) { + return cache->read_extent_maybe_partial( + t, std::move(extent), direct_partial_off, partial_len); + }); } else { - return this->pin_to_extent<T>(t, std::move(std::get<0>(ret))); + return this->pin_to_extent<T>( + t, std::move(std::get<0>(ret)), direct_partial_off, 
partial_len); } - }).si_then([FNAME, &t](TCachedExtentRef<T> ext) { - SUBDEBUGT(seastore_tm, "got {}", t, *ext); - return ext; + }).si_then([FNAME, maybe_indirect_info, is_clone, &t](TCachedExtentRef<T> ext) { + if (maybe_indirect_info.has_value()) { + SUBDEBUGT(seastore_tm, "got indirect +0x{:x}~0x{:x} is_clone={} {}", + t, maybe_indirect_info->intermediate_offset, + maybe_indirect_info->length, is_clone, *ext); + } else { + SUBDEBUGT(seastore_tm, "got direct is_clone={} {}", + t, is_clone, *ext); + } + return maybe_indirect_extent_t<T>{ext, maybe_indirect_info, is_clone}; }); } + template <typename T> + base_iertr::future<maybe_indirect_extent_t<T>> read_pin( + Transaction &t, + LBAMappingRef pin) + { + auto& pin_ref = *pin; + return read_pin<T>(t, std::move(pin), 0, pin_ref.get_length()); + } + /// Obtain mutable copy of extent LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) { LOG_PREFIX(TransactionManager::get_mutable_extent); @@ -303,10 +395,6 @@ public: len, placement_hint, INIT_GENERATION); - if (!ext) { - SUBERRORT(seastore_tm, "insufficient space!", t); - return crimson::ct_error::enospc::make(); - } return lba_manager->alloc_extent( t, laddr_hint, @@ -342,10 +430,6 @@ public: len, placement_hint, INIT_GENERATION); - if (exts.empty()) { - SUBERRORT(seastore_tm, "insufficient space!", t); - return crimson::ct_error::enospc::make(); - } return lba_manager->alloc_extents( t, laddr_hint, @@ -362,7 +446,8 @@ public: } template <typename T> - read_extent_ret<T> get_mutable_extent_by_laddr( + get_pin_iertr::future<TCachedExtentRef<T>> + get_mutable_extent_by_laddr( Transaction &t, laddr_t laddr, extent_len_t len) { @@ -374,8 +459,11 @@ public: ceph_assert(!pin->is_clone()); ceph_assert(pin->get_length() == len); return this->read_pin<T>(t, std::move(pin)); - }).si_then([this, &t, FNAME](auto extent) { - auto ext = get_mutable_extent(t, extent)->template cast<T>(); + }).si_then([this, &t, FNAME](auto maybe_indirect_extent) { + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); + auto ext = get_mutable_extent( + t, maybe_indirect_extent.extent)->template cast<T>(); SUBDEBUGT(seastore_tm, "got mutable {}", t, *ext); return read_extent_iertr::make_ready_future<TCachedExtentRef<T>>( std::move(ext)); @@ -438,6 +526,7 @@ public: // The according extent might be stable or pending. auto fut = base_iertr::now(); if (!pin->is_indirect()) { + ceph_assert(!pin->is_clone()); if (!pin->is_parent_viewable()) { if (pin->is_parent_valid()) { pin = pin->refresh_with_pending_parent(); @@ -458,7 +547,12 @@ public: fut = fut.si_then([this, &t, &pin] { if (full_extent_integrity_check) { - return read_pin<T>(t, pin->duplicate()); + return read_pin<T>(t, pin->duplicate() + ).si_then([](auto maybe_indirect_extent) { + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); + return maybe_indirect_extent.extent; + }); } else { auto ret = get_extent_if_linked<T>(t, pin->duplicate()); if (ret.index() == 1) { @@ -475,6 +569,7 @@ public: ? 
(ext && ext->is_fully_loaded()) : true); std::optional<ceph::bufferptr> original_bptr; + // TODO: preserve the bufferspace if partially loaded if (ext && ext->is_fully_loaded()) { ceph_assert(!ext->is_mutable()); ceph_assert(ext->get_length() >= original_len); @@ -646,8 +741,9 @@ public: TransactionRef create_transaction( Transaction::src_t src, const char* name, + cache_hint_t cache_hint = CACHE_HINT_TOUCH, bool is_weak=false) final { - return cache->create_transaction(src, name, is_weak); + return cache->create_transaction(src, name, cache_hint, is_weak); } using ExtentCallbackInterface::submit_transaction_direct_ret; @@ -690,9 +786,14 @@ public: const std::string &key) { return cache->get_root( t - ).si_then([&key, &t](auto root) { + ).si_then([&t, this](auto root) { + return read_extent<RootMetaBlock>(t, root->root.meta); + }).si_then([key, &t](auto maybe_indirect_extent) { LOG_PREFIX(TransactionManager::read_root_meta); - auto meta = root->root.get_meta(); + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); + auto& mblock = maybe_indirect_extent.extent; + auto meta = mblock->get_meta(); auto iter = meta.find(key); if (iter == meta.end()) { SUBDEBUGT(seastore_tm, "{} -> nullopt", t, key); @@ -701,7 +802,35 @@ public: SUBDEBUGT(seastore_tm, "{} -> {}", t, key, iter->second); return seastar::make_ready_future<read_root_meta_bare>(iter->second); } - }); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error!"} + ); + } + + /** + * init_root_meta + * + * create the root meta block + */ + using init_root_meta_iertr = base_iertr; + using init_root_meta_ret = init_root_meta_iertr::future<>; + init_root_meta_ret init_root_meta(Transaction &t) { + return alloc_non_data_extent<RootMetaBlock>( + t, L_ADDR_MIN, RootMetaBlock::SIZE + ).si_then([this, &t](auto meta) { + meta->set_meta(RootMetaBlock::meta_t{}); + return cache->get_root(t + ).si_then([this, &t, meta](auto root) { + auto mroot = cache->duplicate_for_write( + t, root)->template cast<RootBlock>(); + mroot->root.meta = meta->get_laddr(); + return seastar::now(); + }); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error!"} + ); } /** @@ -719,15 +848,24 @@ public: SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value); return cache->get_root( t - ).si_then([this, &t, &key, &value](RootBlockRef root) { - root = cache->duplicate_for_write(t, root)->cast<RootBlock>(); - - auto meta = root->root.get_meta(); + ).si_then([this, &t](RootBlockRef root) { + return read_extent<RootMetaBlock>(t, root->root.meta); + }).si_then([this, key, value, &t](auto maybe_indirect_extent) { + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); + auto& mblock = maybe_indirect_extent.extent; + mblock = get_mutable_extent(t, mblock + )->template cast<RootMetaBlock>(); + + auto meta = mblock->get_meta(); meta[key] = value; - root->root.set_meta(meta); + mblock->set_meta(meta); return seastar::now(); - }); + }).handle_error_interruptible( + crimson::ct_error::input_output_error::pass_further{}, + crimson::ct_error::assert_all{"unexpected error!"} + ); } /** @@ -817,7 +955,7 @@ private: shard_stats_t& shard_stats; template <typename T> - std::variant<LBAMappingRef, base_iertr::future<TCachedExtentRef<T>>> + std::variant<LBAMappingRef, get_child_ifut<T>> get_extent_if_linked( Transaction &t, LBAMappingRef pin) @@ 
-827,7 +965,8 @@ private: // and linking the absent child auto v = pin->get_logical_extent(t); if (v.has_child()) { - return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) { + return v.get_child_fut( + ).si_then([pin=std::move(pin)](auto extent) { #ifndef NDEBUG auto lextent = extent->template cast<LogicalCachedExtent>(); auto pin_laddr = pin->get_key(); @@ -849,11 +988,17 @@ private: extent_types_t type) { ceph_assert(!pin->parent_modified()); + assert(!pin->is_indirect()); + // Note: pin might be a clone auto v = pin->get_logical_extent(t); // checking the lba child must be atomic with creating // and linking the absent child if (v.has_child()) { - return std::move(v.get_child_fut()); + return std::move(v.get_child_fut() + ).si_then([type](auto ext) { + ceph_assert(ext->get_type() == type); + return ext; + }); } else { return pin_to_extent_by_type(t, std::move(pin), type); } @@ -877,6 +1022,7 @@ private: * pin_to_extent * * Get extent mapped at pin. + * partially load buffer from direct_partial_off~partial_len if not present. */ using pin_to_extent_iertr = base_iertr; template <typename T> @@ -885,18 +1031,28 @@ private: template <typename T> pin_to_extent_ret<T> pin_to_extent( Transaction &t, - LBAMappingRef pin) { - LOG_PREFIX(TransactionManager::pin_to_extent); - SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin); + LBAMappingRef pin, + extent_len_t direct_partial_off, + extent_len_t partial_len) { static_assert(is_logical_type(T::TYPE)); using ret = pin_to_extent_ret<T>; auto &pref = *pin; + auto direct_length = pref.is_indirect() ? + pref.get_intermediate_length() : + pref.get_length(); + if (full_extent_integrity_check) { + direct_partial_off = 0; + partial_len = direct_length; + } + LOG_PREFIX(TransactionManager::pin_to_extent); + SUBTRACET(seastore_tm, "getting absent extent from pin {}, 0x{:x}~0x{:x} ...", + t, *pin, direct_partial_off, partial_len); return cache->get_absent_extent<T>( t, pref.get_val(), - pref.is_indirect() ? 
- pref.get_intermediate_length() : - pref.get_length(), + direct_length, + direct_partial_off, + partial_len, [&pref] (T &extent) mutable { assert(!extent.has_laddr()); @@ -907,30 +1063,33 @@ private: extent.maybe_set_intermediate_laddr(pref); } ).si_then([FNAME, &t, pin=std::move(pin), this](auto ref) mutable -> ret { - auto crc = ref->calc_crc32c(); - SUBTRACET( - seastore_tm, - "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}", - t, - *ref, - pin->get_checksum(), - crc); - assert(ref->is_fully_loaded()); - bool inconsistent = false; - if (full_extent_integrity_check) { - inconsistent = (pin->get_checksum() != crc); - } else { // !full_extent_integrity_check: remapped extent may be skipped - inconsistent = !(pin->get_checksum() == 0 || - pin->get_checksum() == crc); - } - if (unlikely(inconsistent)) { - SUBERRORT(seastore_tm, - "extent checksum inconsistent, recorded: {}, actual: {}, {}", + if (ref->is_fully_loaded()) { + auto crc = ref->calc_crc32c(); + SUBTRACET( + seastore_tm, + "got extent -- {}, chksum in the lba tree: 0x{:x}, actual chksum: 0x{:x}", t, + *ref, pin->get_checksum(), - crc, - *ref); - ceph_abort(); + crc); + bool inconsistent = false; + if (full_extent_integrity_check) { + inconsistent = (pin->get_checksum() != crc); + } else { // !full_extent_integrity_check: remapped extent may be skipped + inconsistent = !(pin->get_checksum() == 0 || + pin->get_checksum() == crc); + } + if (unlikely(inconsistent)) { + SUBERRORT(seastore_tm, + "extent checksum inconsistent, recorded: 0x{:x}, actual: 0x{:x}, {}", + t, + pin->get_checksum(), + crc, + *ref); + ceph_abort(); + } + } else { + assert(!full_extent_integrity_check); } return pin_to_extent_ret<T>( interruptible::ready_future_marker{}, @@ -955,14 +1114,21 @@ private: t, *pin, type); assert(is_logical_type(type)); auto &pref = *pin; + laddr_t direct_key; + extent_len_t direct_length; + if (pref.is_indirect()) { + direct_key = pref.get_intermediate_base(); + direct_length = pref.get_intermediate_length(); + } else { + direct_key = pref.get_key(); + direct_length = pref.get_length(); + } return cache->get_absent_extent_by_type( t, type, pref.get_val(), - pref.get_key(), - pref.is_indirect() ? 
- pref.get_intermediate_length() : - pref.get_length(), + direct_key, + direct_length, [&pref](CachedExtent &extent) mutable { auto &lextent = static_cast<LogicalCachedExtent&>(extent); assert(!lextent.has_laddr()); @@ -977,7 +1143,7 @@ private: auto crc = ref->calc_crc32c(); SUBTRACET( seastore_tm, - "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}", + "got extent -- {}, chksum in the lba tree: 0x{:x}, actual chksum: 0x{:x}", t, *ref, pin->get_checksum(), @@ -992,7 +1158,7 @@ private: } if (unlikely(inconsistent)) { SUBERRORT(seastore_tm, - "extent checksum inconsistent, recorded: {}, actual: {}, {}", + "extent checksum inconsistent, recorded: 0x{:x}, actual: 0x{:x}, {}", t, pin->get_checksum(), crc, diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h index 522a93a1ddc..ce649303d4f 100644 --- a/src/crimson/osd/backfill_facades.h +++ b/src/crimson/osd/backfill_facades.h @@ -36,6 +36,10 @@ struct PeeringFacade final : BackfillState::PeeringFacade { return peering_state.get_info().log_tail; } + const PGLog& get_pg_log() const override { + return peering_state.get_pg_log(); + } + void scan_log_after(eversion_t v, scan_log_func_t f) const override { peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f)); } @@ -73,7 +77,14 @@ struct PGFacade final : BackfillState::PGFacade { return pg.projected_last_update; } + const PGLog::IndexedLog& get_projected_log() const override { + return pg.projected_log; + } + PGFacade(PG& pg) : pg(pg) {} + std::ostream &print(std::ostream &out) const override { + return out << pg; + } }; } // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc index 018e58b68f8..f957f072c93 100644 --- a/src/crimson/osd/backfill_state.cc +++ b/src/crimson/osd/backfill_state.cc @@ -8,11 +8,7 @@ #include "crimson/osd/backfill_state.h" #include "osd/osd_types_fmt.h" -namespace { - seastar::logger& logger() { - return crimson::get_logger(ceph_subsys_osd); - } -} +SET_SUBSYS(osd); namespace crimson::osd { @@ -27,22 +23,23 @@ BackfillState::BackfillState( progress_tracker( std::make_unique<BackfillState::ProgressTracker>(backfill_machine)) { - logger().debug("{}:{}", __func__, __LINE__); + LOG_PREFIX(BackfillState::BackfillState); + DEBUGDPP("", *backfill_machine.pg); backfill_machine.initiate(); } template <class S> BackfillState::StateHelper<S>::StateHelper() { - logger().debug("enter {}", - boost::typeindex::type_id<S>().pretty_name()); + LOG_PREFIX(BackfillState::StateHelper); + DEBUGDPP("enter {}", pg(), boost::typeindex::type_id<S>().pretty_name()); } template <class S> BackfillState::StateHelper<S>::~StateHelper() { - logger().debug("exit {}", - boost::typeindex::type_id<S>().pretty_name()); + LOG_PREFIX(BackfillState::StateHelper); + DEBUG("exit {}", boost::typeindex::type_id<S>().pretty_name()); } BackfillState::~BackfillState() = default; @@ -63,13 +60,16 @@ BackfillState::BackfillMachine::~BackfillMachine() = default; BackfillState::Initial::Initial(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::Initial::Initial); backfill_state().last_backfill_started = peering_state().earliest_backfill(); - logger().debug("{}: bft={} from {}", - __func__, peering_state().get_backfill_targets(), - backfill_state().last_backfill_started); + DEBUGDPP("{}: bft={} from {}", + pg(), + __func__, + peering_state().get_backfill_targets(), + backfill_state().last_backfill_started); for (const auto& bt : peering_state().get_backfill_targets()) { - logger().debug("{}: target 
shard {} from {}", - __func__, bt, peering_state().get_peer_last_backfill(bt)); + DEBUGDPP("{}: target shard {} from {}", + pg(), __func__, bt, peering_state().get_peer_last_backfill(bt)); } ceph_assert(peering_state().get_backfill_targets().size()); ceph_assert(!backfill_state().last_backfill_started.is_max()); @@ -80,7 +80,8 @@ BackfillState::Initial::Initial(my_context ctx) boost::statechart::result BackfillState::Initial::react(const BackfillState::Triggered& evt) { - logger().debug("{}: backfill triggered", __func__); + LOG_PREFIX(BackfillState::Initial::react::Triggered); + DEBUGDPP("", pg()); ceph_assert(backfill_state().last_backfill_started == \ peering_state().earliest_backfill()); ceph_assert(peering_state().is_backfilling()); @@ -93,26 +94,10 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt) if (Enqueuing::all_enqueued(peering_state(), backfill_state().backfill_info, backfill_state().peer_backfill_info)) { - logger().debug("{}: switching to Done state", __func__); + DEBUGDPP("switching to Done state", pg()); return transit<BackfillState::Done>(); } else { - logger().debug("{}: switching to Enqueuing state", __func__); - return transit<BackfillState::Enqueuing>(); - } -} - -boost::statechart::result -BackfillState::Cancelled::react(const BackfillState::Triggered& evt) -{ - logger().debug("{}: backfill re-triggered", __func__); - ceph_assert(peering_state().is_backfilling()); - if (Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - logger().debug("{}: switching to Done state", __func__); - return transit<BackfillState::Done>(); - } else { - logger().debug("{}: switching to Enqueuing state", __func__); + DEBUGDPP("switching to Enqueuing state", pg()); return transit<BackfillState::Enqueuing>(); } } @@ -120,12 +105,12 @@ BackfillState::Cancelled::react(const BackfillState::Triggered& evt) // -- Enqueuing void BackfillState::Enqueuing::maybe_update_range() { + LOG_PREFIX(BackfillState::Enqueuing::maybe_update_range); if (auto& primary_bi = backfill_state().backfill_info; primary_bi.version >= pg().get_projected_last_update()) { - logger().info("{}: bi is current", __func__); + INFODPP("bi is current", pg()); ceph_assert(primary_bi.version == pg().get_projected_last_update()); } else if (primary_bi.version >= peering_state().get_log_tail()) { -#if 0 if (peering_state().get_pg_log().get_log().empty() && pg().get_projected_log().empty()) { /* Because we don't move log_tail on split, the log might be @@ -137,30 +122,32 @@ void BackfillState::Enqueuing::maybe_update_range() ceph_assert(primary_bi.version == eversion_t()); return; } -#endif - logger().debug("{}: bi is old, ({}) can be updated with log to {}", - __func__, - primary_bi.version, - pg().get_projected_last_update()); - logger().debug("{}: scanning pg log first", __func__); - peering_state().scan_log_after(primary_bi.version, + DEBUGDPP("{}: bi is old, ({}) can be updated with log to {}", + pg(), + primary_bi.version, + pg().get_projected_last_update()); + auto func = [&](const pg_log_entry_t& e) { - logger().debug("maybe_update_range(lambda): updating from version {}", - e.version); + DEBUGDPP("maybe_update_range(lambda): updating from version {}", + pg(), e.version); if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) { if (e.is_update()) { - logger().debug("maybe_update_range(lambda): {} updated to ver {}", - e.soid, e.version); + DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}", + pg(), e.soid, e.version); 
primary_bi.objects.erase(e.soid); primary_bi.objects.insert(std::make_pair(e.soid, e.version)); } else if (e.is_delete()) { - logger().debug("maybe_update_range(lambda): {} removed", - e.soid); + DEBUGDPP("maybe_update_range(lambda): {} removed", + pg(), e.soid); primary_bi.objects.erase(e.soid); } } - }); + }; + DEBUGDPP("{}: scanning pg log first", pg()); + peering_state().scan_log_after(primary_bi.version, func); + DEBUGDPP("{}: scanning projected log", pg()); + pg().get_projected_log().scan_log_after(primary_bi.version, func); primary_bi.version = pg().get_projected_last_update(); } else { ceph_abort_msg( @@ -243,6 +230,7 @@ void BackfillState::Enqueuing::trim_backfilled_object_from_intervals( BackfillState::Enqueuing::result_t BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) { + LOG_PREFIX(BackfillState::Enqueuing::remove_on_peers); // set `new_last_backfill_started` to `check` result_t result { {}, check }; for (const auto& bt : peering_state().get_backfill_targets()) { @@ -254,8 +242,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) backfill_listener().enqueue_drop(bt, pbi.begin, version); } } - logger().debug("{}: BACKFILL removing {} from peers {}", - __func__, check, result.pbi_targets); + DEBUGDPP("BACKFILL removing {} from peers {}", + pg(), check, result.pbi_targets); ceph_assert(!result.pbi_targets.empty()); return result; } @@ -263,7 +251,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check) BackfillState::Enqueuing::result_t BackfillState::Enqueuing::update_on_peers(const hobject_t& check) { - logger().debug("{}: check={}", __func__, check); + LOG_PREFIX(BackfillState::Enqueuing::update_on_peers); + DEBUGDPP("check={}", pg(), check); const auto& primary_bi = backfill_state().backfill_info; result_t result { {}, primary_bi.begin }; std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills; @@ -324,6 +313,7 @@ bool BackfillState::Enqueuing::Enqueuing::all_emptied( BackfillState::Enqueuing::Enqueuing(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::Enqueuing::Enqueuing); auto& primary_bi = backfill_state().backfill_info; // update our local interval to cope with recent changes @@ -333,8 +323,7 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) // that backfill will be spinning here over and over. For the sake // of performance and complexity we don't synchronize with entire PG. // similar can happen in classical OSD. - logger().warn("{}: bi is old, rescanning of local backfill_info", - __func__); + WARNDPP("bi is old, rescanning of local backfill_info", pg()); post_event(RequestPrimaryScanning{}); return; } else { @@ -346,13 +335,14 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) primary_bi)) { // need to grab one another chunk of the object namespace and restart // the queueing. 
- logger().debug("{}: reached end for current local chunk", __func__); + DEBUGDPP("reached end for current local chunk", pg()); post_event(RequestPrimaryScanning{}); return; } do { if (!backfill_listener().budget_available()) { + DEBUGDPP("throttle failed, turning to Waiting", pg()); post_event(RequestWaiting{}); return; } else if (should_rescan_replicas(backfill_state().peer_backfill_info, @@ -378,28 +368,38 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx) trim_backfilled_object_from_intervals(std::move(result), backfill_state().last_backfill_started, backfill_state().peer_backfill_info); - } else { + backfill_listener().maybe_flush(); + } else if (!primary_bi.empty()) { auto result = update_on_peers(check); trim_backfilled_object_from_intervals(std::move(result), backfill_state().last_backfill_started, backfill_state().peer_backfill_info); - if (!primary_bi.empty()) { - primary_bi.pop_front(); - } + primary_bi.pop_front(); + backfill_listener().maybe_flush(); + } else { + break; } - backfill_listener().maybe_flush(); } while (!all_emptied(primary_bi, backfill_state().peer_backfill_info)); - if (backfill_state().progress_tracker->tracked_objects_completed() - && Enqueuing::all_enqueued(peering_state(), - backfill_state().backfill_info, - backfill_state().peer_backfill_info)) { - backfill_state().last_backfill_started = hobject_t::get_max(); - backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + if (should_rescan_primary(backfill_state().peer_backfill_info, + primary_bi)) { + // need to grab one another chunk of the object namespace and restart + // the queueing. + DEBUGDPP("reached end for current local chunk", pg()); + post_event(RequestPrimaryScanning{}); + return; + } else { + if (backfill_state().progress_tracker->tracked_objects_completed() + && Enqueuing::all_enqueued(peering_state(), + backfill_state().backfill_info, + backfill_state().peer_backfill_info)) { + backfill_state().last_backfill_started = hobject_t::get_max(); + backfill_listener().update_peers_last_backfill(hobject_t::get_max()); + } + DEBUGDPP("reached end for both local and all peers " + "but still has in-flight operations", pg()); + post_event(RequestWaiting{}); } - logger().debug("{}: reached end for both local and all peers " - "but still has in-flight operations", __func__); - post_event(RequestWaiting{}); } // -- PrimaryScanning @@ -414,16 +414,45 @@ BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx) boost::statechart::result BackfillState::PrimaryScanning::react(PrimaryScanned evt) { - logger().debug("{}", __func__); + LOG_PREFIX(BackfillState::PrimaryScanning::react::PrimaryScanned); + DEBUGDPP("", pg()); backfill_state().backfill_info = std::move(evt.result); - return transit<Enqueuing>(); + if (!backfill_state().is_suspended()) { + return transit<Enqueuing>(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(CancelBackfill evt) +{ + LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill); + DEBUGDPP("suspended within PrimaryScanning", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::PrimaryScanning::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::PrimaryScanning::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + 
return transit<Enqueuing>(); + } + return discard_event(); } boost::statechart::result BackfillState::PrimaryScanning::react(ObjectPushed evt) { - logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::PrimaryScanning::react::ObjectPushed); + DEBUGDPP("PrimaryScanning::react() on ObjectPushed; evt.object={}", + pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -441,11 +470,11 @@ bool BackfillState::ReplicasScanning::replica_needs_scan( BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx) : my_base(ctx) { + LOG_PREFIX(BackfillState::ReplicasScanning::ReplicasScanning); for (const auto& bt : peering_state().get_backfill_targets()) { if (const auto& pbi = backfill_state().peer_backfill_info.at(bt); replica_needs_scan(pbi, backfill_state().backfill_info)) { - logger().debug("{}: scanning peer osd.{} from {}", - __func__, bt, pbi.end); + DEBUGDPP("scanning peer osd.{} from {}", pg(), bt, pbi.end); backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{}); ceph_assert(waiting_on_backfill.find(bt) == \ @@ -467,8 +496,9 @@ BackfillState::ReplicasScanning::~ReplicasScanning() boost::statechart::result BackfillState::ReplicasScanning::react(ReplicaScanned evt) { - logger().debug("{}: got scan result from osd={}, result={}", - __func__, evt.from, evt.result); + LOG_PREFIX(BackfillState::ReplicasScanning::react::ReplicaScanned); + DEBUGDPP("got scan result from osd={}, result={}", + pg(), evt.from, evt.result); // TODO: maybe we'll be able to move waiting_on_backfill from // the machine to the state. ceph_assert(peering_state().is_backfill_target(evt.from)); @@ -477,12 +507,17 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) if (waiting_on_backfill.empty()) { ceph_assert(backfill_state().peer_backfill_info.size() == \ peering_state().get_backfill_targets().size()); - return transit<Enqueuing>(); + if (!backfill_state().is_suspended()) { + return transit<Enqueuing>(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } } } else { - // we canceled backfill for a while due to a too full, and this + // we suspended backfill for a while due to a too full, and this // is an extra response from a non-too-full peer - logger().debug("{}: canceled backfill (too full?)", __func__); + DEBUGDPP("suspended backfill (too full?)", pg()); } return discard_event(); } @@ -490,17 +525,30 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt) boost::statechart::result BackfillState::ReplicasScanning::react(CancelBackfill evt) { - logger().debug("{}: cancelled within ReplicasScanning", - __func__); - waiting_on_backfill.clear(); - return transit<Cancelled>(); + LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill); + DEBUGDPP("suspended within ReplicasScanning", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::ReplicasScanning::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::ReplicasScanning::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit<Enqueuing>(); + } + return discard_event(); } boost::statechart::result BackfillState::ReplicasScanning::react(ObjectPushed evt) { - logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}", - evt.object); + 
LOG_PREFIX(BackfillState::ReplicasScanning::react::ObjectPushed); + DEBUGDPP("ReplicasScanning::react() on ObjectPushed; evt.object={}", + pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true); return discard_event(); } @@ -515,17 +563,45 @@ BackfillState::Waiting::Waiting(my_context ctx) boost::statechart::result BackfillState::Waiting::react(ObjectPushed evt) { - logger().debug("Waiting::react() on ObjectPushed; evt.object={}", - evt.object); + LOG_PREFIX(BackfillState::Waiting::react::ObjectPushed); + DEBUGDPP("Waiting::react() on ObjectPushed; evt.object={}", pg(), evt.object); backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false); - return transit<Enqueuing>();; + if (!backfill_state().is_suspended()) { + return transit<Enqueuing>(); + } else { + DEBUGDPP("backfill suspended, not going Enqueuing", pg()); + backfill_state().go_enqueuing_on_resume(); + } + return discard_event(); +} + +boost::statechart::result +BackfillState::Waiting::react(CancelBackfill evt) +{ + LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill); + DEBUGDPP("suspended within Waiting", pg()); + backfill_state().on_suspended(); + return discard_event(); +} + +boost::statechart::result +BackfillState::Waiting::react(Triggered evt) +{ + LOG_PREFIX(BackfillState::Waiting::react::Triggered); + ceph_assert(backfill_state().is_suspended()); + if (backfill_state().on_resumed()) { + DEBUGDPP("Backfill resumed, going Enqueuing", pg()); + return transit<Enqueuing>(); + } + return discard_event(); } // -- Done BackfillState::Done::Done(my_context ctx) : my_base(ctx) { - logger().info("{}: backfill is done", __func__); + LOG_PREFIX(BackfillState::Done::Done); + INFODPP("backfill is done", pg()); backfill_listener().backfilled(); } @@ -535,13 +611,6 @@ BackfillState::Crashed::Crashed() ceph_abort_msg("{}: this should not happen"); } -// -- Cancelled -BackfillState::Cancelled::Cancelled(my_context ctx) - : my_base(ctx) -{ - ceph_assert(peering_state().get_backfill_targets().size()); -} - // ProgressTracker is an intermediary between the BackfillListener and // BackfillMachine + its states. All requests to push or drop an object // are directed through it. 
The same happens with notifications about @@ -575,8 +644,8 @@ void BackfillState::ProgressTracker::complete_to( const pg_stat_t& stats, bool may_push_to_max) { - logger().debug("{}: obj={}", - __func__, obj); + LOG_PREFIX(BackfillState::ProgressTracker::complete_to); + DEBUGDPP("obj={}", pg(), obj); if (auto completion_iter = registry.find(obj); completion_iter != std::end(registry)) { completion_iter->second = \ @@ -609,4 +678,27 @@ void BackfillState::ProgressTracker::complete_to( } } +void BackfillState::enqueue_standalone_push( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) { + progress_tracker->enqueue_push(obj); + backfill_machine.backfill_listener.enqueue_push(obj, v, peers); +} + +void BackfillState::enqueue_standalone_delete( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) +{ + progress_tracker->enqueue_drop(obj); + for (auto bt : peers) { + backfill_machine.backfill_listener.enqueue_drop(bt, obj, v); + } +} + +std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg) { + return pg.print(out); +} + } // namespace crimson::osd diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h index ddc0cbf7355..517a02ea4df 100644 --- a/src/crimson/osd/backfill_state.h +++ b/src/crimson/osd/backfill_state.h @@ -14,6 +14,7 @@ #include <boost/statechart/transition.hpp> #include "osd/recovery_types.h" +#include "osd/PGLog.h" namespace crimson::osd { @@ -61,6 +62,8 @@ struct BackfillState { struct CancelBackfill : sc::event<CancelBackfill> { }; + struct ThrottleAcquired : sc::event<ThrottleAcquired> { + }; private: // internal events struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> { @@ -135,34 +138,10 @@ public: explicit Crashed(); }; - struct Cancelled : sc::state<Cancelled, BackfillMachine>, - StateHelper<Cancelled> { - using reactions = boost::mpl::list< - sc::custom_reaction<Triggered>, - sc::custom_reaction<PrimaryScanned>, - sc::custom_reaction<ReplicaScanned>, - sc::custom_reaction<ObjectPushed>, - sc::transition<sc::event_base, Crashed>>; - explicit Cancelled(my_context); - // resume after triggering backfill by on_activate_complete(). - // transit to Enqueuing. - sc::result react(const Triggered&); - sc::result react(const PrimaryScanned&) { - return discard_event(); - } - sc::result react(const ReplicaScanned&) { - return discard_event(); - } - sc::result react(const ObjectPushed&) { - return discard_event(); - } - }; - struct Initial : sc::state<Initial, BackfillMachine>, StateHelper<Initial> { using reactions = boost::mpl::list< sc::custom_reaction<Triggered>, - sc::transition<CancelBackfill, Cancelled>, sc::transition<sc::event_base, Crashed>>; explicit Initial(my_context); // initialize after triggering backfill by on_activate_complete(). 
@@ -173,12 +152,9 @@ public: struct Enqueuing : sc::state<Enqueuing, BackfillMachine>, StateHelper<Enqueuing> { using reactions = boost::mpl::list< - sc::transition<CancelBackfill, Cancelled>, sc::transition<RequestPrimaryScanning, PrimaryScanning>, sc::transition<RequestReplicasScanning, ReplicasScanning>, sc::transition<RequestWaiting, Waiting>, - sc::transition<RequestDone, Done>, - sc::transition<CancelBackfill, Cancelled>, sc::transition<sc::event_base, Crashed>>; explicit Enqueuing(my_context); @@ -236,12 +212,15 @@ public: sc::custom_reaction<ObjectPushed>, sc::custom_reaction<PrimaryScanned>, sc::transition<RequestDone, Done>, - sc::transition<CancelBackfill, Cancelled>, + sc::custom_reaction<CancelBackfill>, + sc::custom_reaction<Triggered>, sc::transition<sc::event_base, Crashed>>; explicit PrimaryScanning(my_context); sc::result react(ObjectPushed); // collect scanning result and transit to Enqueuing. sc::result react(PrimaryScanned); + sc::result react(CancelBackfill); + sc::result react(Triggered); }; struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>, @@ -250,6 +229,7 @@ public: sc::custom_reaction<ObjectPushed>, sc::custom_reaction<ReplicaScanned>, sc::custom_reaction<CancelBackfill>, + sc::custom_reaction<Triggered>, sc::transition<RequestDone, Done>, sc::transition<sc::event_base, Crashed>>; explicit ReplicasScanning(my_context); @@ -258,6 +238,7 @@ public: sc::result react(ObjectPushed); sc::result react(ReplicaScanned); sc::result react(CancelBackfill); + sc::result react(Triggered); // indicate whether a particular peer should be scanned to retrieve // BackfillInterval for new range of hobject_t namespace. @@ -276,17 +257,25 @@ public: using reactions = boost::mpl::list< sc::custom_reaction<ObjectPushed>, sc::transition<RequestDone, Done>, - sc::transition<CancelBackfill, Cancelled>, + sc::custom_reaction<CancelBackfill>, + sc::custom_reaction<Triggered>, + sc::transition<ThrottleAcquired, Enqueuing>, sc::transition<sc::event_base, Crashed>>; explicit Waiting(my_context); sc::result react(ObjectPushed); + sc::result react(CancelBackfill); + sc::result react(Triggered); }; struct Done : sc::state<Done, BackfillMachine>, StateHelper<Done> { using reactions = boost::mpl::list< + sc::custom_reaction<CancelBackfill>, sc::transition<sc::event_base, Crashed>>; explicit Done(my_context); + sc::result react(CancelBackfill) { + return discard_event(); + } }; BackfillState(BackfillListener& backfill_listener, @@ -299,6 +288,20 @@ public: backfill_machine.process_event(*std::move(evt)); } + void enqueue_standalone_push( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); + void enqueue_standalone_delete( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); + + + bool is_triggered() const { + return backfill_machine.triggering_event() != nullptr; + } + hobject_t get_last_backfill_started() const { return last_backfill_started; } @@ -311,6 +314,26 @@ public: } } private: + struct backfill_suspend_state_t { + bool suspended = false; + bool should_go_enqueuing = false; + } backfill_suspend_state; + bool is_suspended() const { + return backfill_suspend_state.suspended; + } + void on_suspended() { + ceph_assert(!is_suspended()); + backfill_suspend_state = {true, false}; + } + bool on_resumed() { + auto go_enqueuing = backfill_suspend_state.should_go_enqueuing; + backfill_suspend_state = {false, false}; + return go_enqueuing; + } + void go_enqueuing_on_resume() { + ceph_assert(is_suspended()); + 
backfill_suspend_state.should_go_enqueuing = true; + } hobject_t last_backfill_started; BackfillInterval backfill_info; std::map<pg_shard_t, BackfillInterval> peer_backfill_info; @@ -363,6 +386,7 @@ struct BackfillState::PeeringFacade { virtual hobject_t earliest_backfill() const = 0; virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0; virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0; + virtual const PGLog& get_pg_log() const = 0; virtual const eversion_t& get_last_update() const = 0; virtual const eversion_t& get_log_tail() const = 0; @@ -388,8 +412,12 @@ struct BackfillState::PeeringFacade { // of behaviour that must be provided by a unit test's mock. struct BackfillState::PGFacade { virtual const eversion_t& get_projected_last_update() const = 0; + virtual const PGLog::IndexedLog& get_projected_log() const = 0; + + virtual std::ostream &print(std::ostream &out) const = 0; virtual ~PGFacade() {} }; +std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg); class BackfillState::ProgressTracker { // TODO: apply_stat, @@ -416,6 +444,9 @@ class BackfillState::ProgressTracker { BackfillListener& backfill_listener() { return backfill_machine.backfill_listener; } + PGFacade& pg() { + return *backfill_machine.pg; + } public: ProgressTracker(BackfillMachine& backfill_machine) @@ -430,3 +461,9 @@ public: }; } // namespace crimson::osd + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<crimson::osd::BackfillState::PGFacade> + : fmt::ostream_formatter {}; +#endif + diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc index 32eaaf02b3f..007d0bf35f3 100644 --- a/src/crimson/osd/ec_backend.cc +++ b/src/crimson/osd/ec_backend.cc @@ -26,6 +26,7 @@ ECBackend::_read(const hobject_t& hoid, ECBackend::rep_op_fut_t ECBackend::submit_transaction(const std::set<pg_shard_t> &pg_shards, const hobject_t& hoid, + crimson::osd::ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& osd_op_p, epoch_t min_epoch, epoch_t max_epoch, diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h index 90a7e2b1f4d..b14c78c9fc4 100644 --- a/src/crimson/osd/ec_backend.h +++ b/src/crimson/osd/ec_backend.h @@ -28,6 +28,7 @@ private: rep_op_fut_t submit_transaction(const std::set<pg_shard_t> &pg_shards, const hobject_t& hoid, + crimson::osd::ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& req, epoch_t min_epoch, epoch_t max_epoch, diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc index 03986952b4f..5902fc8c14f 100644 --- a/src/crimson/osd/heartbeat.cc +++ b/src/crimson/osd/heartbeat.cc @@ -9,6 +9,7 @@ #include "messages/MOSDPing.h" #include "messages/MOSDFailure.h" +#include "msg/msg_types.h" #include "crimson/common/config_proxy.h" #include "crimson/common/formatter.h" diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc index fa387804dcd..0bfd3e2266b 100644 --- a/src/crimson/osd/main.cc +++ b/src/crimson/osd/main.cc @@ -24,6 +24,7 @@ #include "crimson/common/buffer_io.h" #include "crimson/common/config_proxy.h" #include "crimson/common/fatal_signal.h" +#include "crimson/common/perf_counters_collection.h" #include "crimson/mon/MonClient.h" #include "crimson/net/Messenger.h" #include "crimson/osd/stop_signal.h" @@ -201,7 +202,7 @@ int main(int argc, const char* argv[]) true); } auto store = crimson::os::FuturizedStore::create( - local_conf().get_val<std::string>("osd_objectstore"), + 
local_conf().get_val<std::string>("crimson_osd_objectstore"), local_conf().get_val<std::string>("osd_data"), local_conf().get_config_values()); diff --git a/src/crimson/osd/main_config_bootstrap_helpers.cc b/src/crimson/osd/main_config_bootstrap_helpers.cc index 3596929527f..e4920eb870f 100644 --- a/src/crimson/osd/main_config_bootstrap_helpers.cc +++ b/src/crimson/osd/main_config_bootstrap_helpers.cc @@ -17,10 +17,13 @@ #include "crimson/common/buffer_io.h" #include "crimson/common/config_proxy.h" #include "crimson/common/fatal_signal.h" +#include "crimson/common/perf_counters_collection.h" #include "crimson/mon/MonClient.h" #include "crimson/net/Messenger.h" #include "crimson/osd/main_config_bootstrap_helpers.h" +#include <sys/wait.h> // for waitpid() + using namespace std::literals; using crimson::common::local_conf; using crimson::common::sharded_conf; diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h index e17af91e3ad..4195e5dc597 100644 --- a/src/crimson/osd/object_context.h +++ b/src/crimson/osd/object_context.h @@ -9,6 +9,7 @@ #include <seastar/core/shared_future.hh> #include <seastar/core/shared_ptr.hh> +#include "common/fmt_common.h" #include "common/intrusive_lru.h" #include "osd/object_state.h" #include "crimson/common/exception.h" @@ -73,6 +74,8 @@ public: using watch_key_t = std::pair<uint64_t, entity_name_t>; std::map<watch_key_t, seastar::shared_ptr<crimson::osd::Watch>> watchers; + CommonOBCPipeline obc_pipeline; + ObjectContext(hobject_t hoid) : lock(hoid), obs(std::move(hoid)) {} @@ -128,30 +131,49 @@ public: } bool is_valid() const { - return !invalidated_by_interval_change; + return !invalidated; } private: - template <typename Lock, typename Func> - auto _with_lock(Lock& lock, Func&& func) { - return lock.lock( - ).then([&lock, func=std::forward<Func>(func), obc=Ref(this)]() mutable { - return seastar::futurize_invoke( - func - ).finally([&lock, obc=std::move(obc)] { - /* We chain the finally block here because it's possible for lock.lock() - * above to fail due to a call to ObjectContext::interrupt, which calls - * tri_mutex::abort. In the event of such an error, the lock isn't - * actually taken and calling unlock() would be incorrect. */ - lock.unlock(); - }); - }); - } - boost::intrusive::list_member_hook<> obc_accessing_hook; uint64_t list_link_cnt = 0; + + /** + * loading_started + * + * ObjectContext instances may be used for pipeline stages + * prior to actually being loaded. + * + * ObjectContextLoader::load_and_lock* use loading_started + * to determine whether to initiate loading or simply take + * the desired lock directly. + * + * If loading_started is not set, the task must set it and + * (syncronously) take an exclusive lock. That exclusive lock + * must be held until the loading completes, at which point the + * lock may be relaxed or released. + * + * If loading_started is set, it is safe to directly take + * the desired lock, once the lock is obtained loading may + * be assumed to be complete. + * + * loading_started, once set, remains set for the lifetime + * of the object. + */ + bool loading_started = false; + + /// true once set_*_state has been called, used for debugging bool fully_loaded = false; - bool invalidated_by_interval_change = false; + + /** + * invalidated + * + * Set to true upon eviction from cache. This happens to all + * cached obc's upon interval change and to the target of + * a repop received on a replica to ensure that the cached + * state is refreshed upon subsequent replica read. 
+ */ + bool invalidated = false; friend class ObjectContextRegistry; friend class ObjectContextLoader; @@ -172,122 +194,20 @@ public: } } + template <typename FormatContext> + auto fmt_print_ctx(FormatContext & ctx) const { + return fmt::format_to( + ctx.out(), "ObjectContext({}, oid={}, refcount={})", + (void*)this, + get_oid(), + get_use_count()); + } + using obc_accessing_option_t = boost::intrusive::member_hook< ObjectContext, boost::intrusive::list_member_hook<>, &ObjectContext::obc_accessing_hook>; - template<RWState::State Type, typename InterruptCond = void, typename Func> - auto with_lock(Func&& func) { - if constexpr (!std::is_void_v<InterruptCond>) { - auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func)); - switch (Type) { - case RWState::RWWRITE: - return _with_lock(lock.for_write(), std::move(wrapper)); - case RWState::RWREAD: - return _with_lock(lock.for_read(), std::move(wrapper)); - case RWState::RWEXCL: - return _with_lock(lock.for_excl(), std::move(wrapper)); - case RWState::RWNONE: - return seastar::futurize_invoke(std::move(wrapper)); - default: - assert(0 == "noop"); - } - } else { - switch (Type) { - case RWState::RWWRITE: - return _with_lock(lock.for_write(), std::forward<Func>(func)); - case RWState::RWREAD: - return _with_lock(lock.for_read(), std::forward<Func>(func)); - case RWState::RWEXCL: - return _with_lock(lock.for_excl(), std::forward<Func>(func)); - case RWState::RWNONE: - return seastar::futurize_invoke(std::forward<Func>(func)); - default: - assert(0 == "noop"); - } - } - } - - /** - * load_then_with_lock - * - * Takes two functions as arguments -- load_func to be invoked - * with an exclusive lock, and func to be invoked under the - * lock type specified by the Type template argument. - * - * Caller must ensure that *this is not already locked, presumably - * by invoking load_then_with_lock immediately after construction. - * - * @param [in] load_func Function to be invoked under excl lock - * @param [in] func Function to be invoked after load_func under - * lock of type Type. 
- */ - template<RWState::State Type, typename Func, typename Func2> - auto load_then_with_lock(Func &&load_func, Func2 &&func) { - class lock_state_t { - tri_mutex *lock = nullptr; - bool excl = false; - - public: - lock_state_t(tri_mutex &lock) : lock(&lock), excl(true) { - ceph_assert(lock.try_lock_for_excl()); - } - lock_state_t(lock_state_t &&o) : lock(o.lock), excl(o.excl) { - o.lock = nullptr; - o.excl = false; - } - lock_state_t() = delete; - lock_state_t &operator=(lock_state_t &&o) = delete; - lock_state_t(const lock_state_t &o) = delete; - lock_state_t &operator=(const lock_state_t &o) = delete; - - void demote() { - ceph_assert(excl); - ceph_assert(lock); - if constexpr (Type == RWState::RWWRITE) { - lock->demote_to_write(); - } else if constexpr (Type == RWState::RWREAD) { - lock->demote_to_read(); - } else if constexpr (Type == RWState::RWNONE) { - lock->unlock_for_excl(); - } - excl = false; - } - - ~lock_state_t() { - if (!lock) - return; - - if constexpr (Type == RWState::RWEXCL) { - lock->unlock_for_excl(); - } else { - if (excl) { - lock->unlock_for_excl(); - return; - } - - if constexpr (Type == RWState::RWWRITE) { - lock->unlock_for_write(); - } else if constexpr (Type == RWState::RWREAD) { - lock->unlock_for_read(); - } - } - } - }; - - return seastar::do_with( - lock_state_t{lock}, - [load_func=std::move(load_func), func=std::move(func)](auto &ls) mutable { - return std::invoke( - std::move(load_func) - ).si_then([func=std::move(func), &ls]() mutable { - ls.demote(); - return std::invoke(std::move(func)); - }); - }); - } - bool empty() const { return !lock.is_acquired(); } @@ -313,12 +233,14 @@ public: void clear_range(const hobject_t &from, const hobject_t &to) { - obc_lru.clear_range(from, to); + obc_lru.clear_range(from, to, [](auto &obc) { + obc.invalidated = true; + }); } void invalidate_on_interval_change() { obc_lru.clear([](auto &obc) { - obc.invalidated_by_interval_change = true; + obc.invalidated = true; }); } @@ -336,3 +258,6 @@ std::optional<hobject_t> resolve_oid(const SnapSet &ss, const hobject_t &oid); } // namespace crimson::osd + +template <> +struct fmt::formatter<RWState::State> : fmt::ostream_formatter {}; diff --git a/src/crimson/osd/object_context_loader.cc b/src/crimson/osd/object_context_loader.cc index 12aa40b925a..483251a23b5 100644 --- a/src/crimson/osd/object_context_loader.cc +++ b/src/crimson/osd/object_context_loader.cc @@ -1,3 +1,4 @@ +#include "crimson/common/coroutine.h" #include "crimson/osd/object_context_loader.h" #include "osd/osd_types_fmt.h" #include "osd/object_state_fmt.h" @@ -8,207 +9,162 @@ namespace crimson::osd { using crimson::common::local_conf; - template<RWState::State State> - ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::with_head_obc(const hobject_t& oid, - with_obc_func_t&& func) - { - return with_locked_obc<State, true /* track */>( - oid, - [func=std::move(func)](auto obc) { - // The template with_obc_func_t wrapper supports two obcs (head and clone). - // In the 'with_head_obc' case, however, only the head is in use. - // Pass the same head obc twice in order to - // to support the generic with_obc sturcture. 
- return std::invoke(std::move(func), obc, obc); - }); + +ObjectContextLoader::load_and_lock_fut +ObjectContextLoader::load_and_lock_head(Manager &manager, RWState::State lock_type) +{ + LOG_PREFIX(ObjectContextLoader::load_and_lock_head); + DEBUGDPP("{} {}", dpp, manager.target, lock_type); + auto releaser = manager.get_releaser(); + ceph_assert(manager.target.is_head()); + + if (manager.head_state.is_empty()) { + auto [obc, _] = obc_registry.get_cached_obc(manager.target); + manager.set_state_obc(manager.head_state, obc); + } + ceph_assert(manager.target_state.is_empty()); + manager.set_state_obc(manager.target_state, manager.head_state.obc); + + if (manager.target_state.obc->loading_started) { + co_await manager.target_state.lock_to(lock_type); + } else { + manager.target_state.lock_excl_sync(); + manager.target_state.obc->loading_started = true; + co_await load_obc(manager.target_state.obc); + manager.target_state.demote_excl_to(lock_type); } + releaser.cancel(); +} + +ObjectContextLoader::load_and_lock_fut +ObjectContextLoader::load_and_lock_clone( + Manager &manager, RWState::State lock_type, bool lock_head) +{ + LOG_PREFIX(ObjectContextLoader::load_and_lock_clone); + DEBUGDPP("{} {}", dpp, manager.target, lock_type); + auto releaser = manager.get_releaser(); - template<RWState::State State> - ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::with_clone_obc(const hobject_t& oid, - with_obc_func_t&& func, - bool resolve_clone) - { - LOG_PREFIX(ObjectContextLoader::with_clone_obc); - assert(!oid.is_head()); - return with_head_obc<RWState::RWREAD>( - oid.get_head(), - [FNAME, oid, func=std::move(func), resolve_clone, this] - (auto head, auto) mutable -> load_obc_iertr::future<> { - if (!head->obs.exists) { - ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid); - return load_obc_iertr::future<>{ - crimson::ct_error::enoent::make() - }; - } - return this->with_clone_obc_only<State>(std::move(head), - oid, - std::move(func), - resolve_clone); - }); + ceph_assert(!manager.target.is_head()); + ceph_assert(manager.target_state.is_empty()); + + if (manager.head_state.is_empty()) { + auto [obc, _] = obc_registry.get_cached_obc(manager.target.get_head()); + manager.set_state_obc(manager.head_state, obc); } - template<RWState::State State> - ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::with_clone_obc_only(ObjectContextRef head, - hobject_t clone_oid, - with_obc_func_t&& func, - bool resolve_clone) - { - LOG_PREFIX(ObjectContextLoader::with_clone_obc_only); - DEBUGDPP("{}", dpp, clone_oid); - assert(!clone_oid.is_head()); - if (resolve_clone) { - auto resolved_oid = resolve_oid(head->get_head_ss(), clone_oid); - if (!resolved_oid) { - ERRORDPP("clone {} not found", dpp, clone_oid); - return load_obc_iertr::future<>{ - crimson::ct_error::enoent::make() - }; - } - if (resolved_oid->is_head()) { - // See resolve_oid - return std::move(func)(head, head); - } - clone_oid = *resolved_oid; - } - return with_locked_obc<State, false /* don't track */>( - clone_oid, - [head=std::move(head), func=std::move(func)](auto clone) { - clone->set_clone_ssc(head->ssc); - return std::move(func)(std::move(head), std::move(clone)); - }); + if (!manager.head_state.obc->loading_started) { + // caller is responsible for pre-populating a loaded obc if lock_head is + // false + ceph_assert(lock_head); + manager.head_state.lock_excl_sync(); + manager.head_state.obc->loading_started = true; + co_await load_obc(manager.head_state.obc); + 
manager.head_state.demote_excl_to(RWState::RWREAD); + } else if (lock_head) { + co_await manager.head_state.lock_to(RWState::RWREAD); } - template<RWState::State State> - ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::with_obc(hobject_t oid, - with_obc_func_t&& func, - bool resolve_clone) - { - if (oid.is_head()) { - return with_head_obc<State>(oid, std::move(func)); - } else { - return with_clone_obc<State>(oid, std::move(func), resolve_clone); + if (manager.options.resolve_clone) { + auto resolved_oid = resolve_oid( + manager.head_state.obc->get_head_ss(), + manager.target); + if (!resolved_oid) { + ERRORDPP("clone {} not found", dpp, manager.target); + co_await load_obc_iertr::future<>( + crimson::ct_error::enoent::make() + ); } + // note: might be head if snap was taken after most recent write! + manager.target = *resolved_oid; } - template<RWState::State State, bool track, typename Func> - ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::with_locked_obc(const hobject_t& oid, - Func&& func) - { - LOG_PREFIX(ObjectContextLoader::with_locked_obc); - auto [obc, existed] = obc_registry.get_cached_obc(oid); - DEBUGDPP("object {} existed {}", - dpp, obc->get_oid(), existed); - if constexpr (track) { - obc->append_to(obc_set_accessing); + if (manager.target.is_head()) { + /* Yes, we assert at the top that manager.target is not head. However, it's + * possible that the requested snap (the resolve_clone path above) actually + * maps to head (a read on an rbd snapshot more recent than the most recent + * write on this specific rbd block, for example). + * + * In such an event, it's hypothetically possible that lock_type isn't + * RWREAD, in which case we need to drop and reacquire the lock. However, + * this case is at present impossible. Actual client requests cannot write + * to a snapshot and will therefore always be RWREAD. The pathways that + * actually can mutate a clone do not set resolve_clone, so target will not + * become head here. + */ + manager.set_state_obc(manager.target_state, manager.head_state.obc); + if (lock_type != manager.head_state.state) { + // This case isn't actually possible at the moment for the above reason. 
+ manager.head_state.release_lock(); + co_await manager.target_state.lock_to(lock_type); + } else { + manager.target_state.state = manager.head_state.state; + manager.head_state.state = RWState::RWNONE; } - if (existed) { - return obc->with_lock<State, IOInterruptCondition>( - [func=std::move(func), obc=ObjectContextRef(obc)] { - return std::invoke(std::move(func), obc); - } - ).finally([FNAME, this, obc=ObjectContextRef(obc)] { - DEBUGDPP("released object {}, {}", dpp, obc->get_oid(), obc->obs); - if constexpr (track) { - obc->remove_from(obc_set_accessing); - } - }); + } else { + auto [obc, _] = obc_registry.get_cached_obc(manager.target); + manager.set_state_obc(manager.target_state, obc); + + if (manager.target_state.obc->loading_started) { + co_await manager.target_state.lock_to(RWState::RWREAD); } else { - return obc->load_then_with_lock<State> ( - [this, obc=ObjectContextRef(obc)] { - return load_obc(obc); - }, - [func=std::move(func), obc=ObjectContextRef(obc)] { - return std::invoke(std::move(func), obc); - } - ).finally([FNAME, this, obc=ObjectContextRef(obc)] { - DEBUGDPP("released object {}, {}", dpp, obc->get_oid(), obc->obs); - if constexpr (track) { - obc->remove_from(obc_set_accessing); - } - }); + manager.target_state.lock_excl_sync(); + manager.target_state.obc->loading_started = true; + co_await load_obc(manager.target_state.obc); + manager.target_state.obc->set_clone_ssc(manager.head_state.obc->ssc); + manager.target_state.demote_excl_to(RWState::RWREAD); } } + releaser.cancel(); +} + +ObjectContextLoader::load_and_lock_fut +ObjectContextLoader::load_and_lock(Manager &manager, RWState::State lock_type) +{ + LOG_PREFIX(ObjectContextLoader::load_and_lock); + DEBUGDPP("{} {}", dpp, manager.target, lock_type); + if (manager.target.is_head()) { + return load_and_lock_head(manager, lock_type); + } else { + return load_and_lock_clone(manager, lock_type); + } +} - ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::load_obc(ObjectContextRef obc) - { - LOG_PREFIX(ObjectContextLoader::load_obc); - return backend.load_metadata(obc->get_oid()) +ObjectContextLoader::load_obc_iertr::future<> +ObjectContextLoader::load_obc(ObjectContextRef obc) +{ + LOG_PREFIX(ObjectContextLoader::load_obc); + return backend.load_metadata(obc->get_oid()) .safe_then_interruptible( [FNAME, this, obc=std::move(obc)](auto md) -> load_obc_ertr::future<> { - const hobject_t& oid = md->os.oi.soid; - DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid); - if (oid.is_head()) { - if (!md->ssc) { - ERRORDPP("oid {} missing snapsetcontext", dpp, oid); - return crimson::ct_error::object_corrupted::make(); - } - obc->set_head_state(std::move(md->os), - std::move(md->ssc)); - } else { - // we load and set the ssc only for head obc. - // For clones, the head's ssc will be referenced later. 
- // See set_clone_ssc - obc->set_clone_state(std::move(md->os)); - } - DEBUGDPP("loaded obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid); - return seastar::now(); - }); - } - - ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::reload_obc(ObjectContext& obc) const - { - LOG_PREFIX(ObjectContextLoader::reload_obc); - assert(obc.is_head()); - return backend.load_metadata(obc.get_oid()) - .safe_then_interruptible<false>( - [FNAME, this, &obc](auto md)-> load_obc_ertr::future<> { - DEBUGDPP("reloaded obs {} for {}", dpp, md->os.oi, obc.get_oid()); - if (!md->ssc) { - ERRORDPP("oid {} missing snapsetcontext", dpp, obc.get_oid()); - return crimson::ct_error::object_corrupted::make(); - } - obc.set_head_state(std::move(md->os), std::move(md->ssc)); - return load_obc_ertr::now(); - }); - } + const hobject_t& oid = md->os.oi.soid; + DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid); + if (oid.is_head()) { + if (!md->ssc) { + ERRORDPP("oid {} missing snapsetcontext", dpp, oid); + return crimson::ct_error::object_corrupted::make(); + } + obc->set_head_state(std::move(md->os), + std::move(md->ssc)); + } else { + // we load and set the ssc only for head obc. + // For clones, the head's ssc will be referenced later. + // See set_clone_ssc + obc->set_clone_state(std::move(md->os)); + } + DEBUGDPP("loaded obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid); + return seastar::now(); + }); +} - void ObjectContextLoader::notify_on_change(bool is_primary) - { - LOG_PREFIX(ObjectContextLoader::notify_on_change); - DEBUGDPP("is_primary: {}", dpp, is_primary); - for (auto& obc : obc_set_accessing) { - DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid()); - obc.interrupt(::crimson::common::actingset_changed(is_primary)); - } +void ObjectContextLoader::notify_on_change(bool is_primary) +{ + LOG_PREFIX(ObjectContextLoader::notify_on_change); + DEBUGDPP("is_primary: {}", dpp, is_primary); + for (auto& obc : obc_set_accessing) { + DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid()); + obc.interrupt(::crimson::common::actingset_changed(is_primary)); } - - // explicitly instantiate the used instantiations - template ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::with_obc<RWState::RWNONE>(hobject_t, - with_obc_func_t&&, - bool resolve_clone); - - template ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::with_obc<RWState::RWREAD>(hobject_t, - with_obc_func_t&&, - bool resolve_clone); - - template ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::with_obc<RWState::RWWRITE>(hobject_t, - with_obc_func_t&&, - bool resolve_clone); - - template ObjectContextLoader::load_obc_iertr::future<> - ObjectContextLoader::with_obc<RWState::RWEXCL>(hobject_t, - with_obc_func_t&&, - bool resolve_clone); +} } diff --git a/src/crimson/osd/object_context_loader.h b/src/crimson/osd/object_context_loader.h index 277708eca4f..49f8f1572bf 100644 --- a/src/crimson/osd/object_context_loader.h +++ b/src/crimson/osd/object_context_loader.h @@ -1,9 +1,14 @@ #pragma once #include <seastar/core/future.hh> +#include <seastar/util/defer.hh> +#include "crimson/common/coroutine.h" #include "crimson/common/errorator.h" +#include "crimson/common/log.h" #include "crimson/osd/object_context.h" +#include "crimson/osd/osd_operation.h" #include "crimson/osd/pg_backend.h" +#include "osd/object_state_fmt.h" namespace crimson::osd { class ObjectContextLoader { @@ -29,6 +34,208 @@ public: ::crimson::osd::IOInterruptCondition, load_obc_ertr>; + class Manager { + ObjectContextLoader 
&loader; + hobject_t target; + + Manager() = delete; + template <typename T> + Manager(ObjectContextLoader &loader, T &&t) + : loader(loader), target(std::forward<T>(t)) {} + Manager(const Manager &) = delete; + Manager &operator=(const Manager &o) = delete; + + struct options_t { + bool resolve_clone = true; + } options; + + struct state_t { + RWState::State state = RWState::RWNONE; + ObjectContextRef obc; + bool is_empty() const { return !obc; } + + void lock_excl_sync() { + bool locked = obc->lock.try_lock_for_excl(); + ceph_assert(locked); + state = RWState::RWEXCL; + } + + void demote_excl_to(RWState::State lock_type) { + assert(state == RWState::RWEXCL); + switch (lock_type) { + case RWState::RWWRITE: + obc->lock.demote_to_write(); + state = RWState::RWWRITE; + break; + case RWState::RWREAD: + obc->lock.demote_to_read(); + state = RWState::RWREAD; + break; + case RWState::RWNONE: + obc->lock.unlock_for_excl(); + state = RWState::RWNONE; + break; + case RWState::RWEXCL: + //noop + break; + default: + ceph_assert(0 == "impossible"); + } + } + + auto lock_to(RWState::State lock_type) { + assert(state == RWState::RWNONE); + switch (lock_type) { + case RWState::RWWRITE: + return interruptor::make_interruptible( + obc->lock.lock_for_write().then([this] { + state = RWState::RWWRITE; + })); + case RWState::RWREAD: + return interruptor::make_interruptible( + obc->lock.lock_for_read().then([this] { + state = RWState::RWREAD; + })); + case RWState::RWNONE: + // noop + return interruptor::now(); + case RWState::RWEXCL: + return interruptor::make_interruptible( + obc->lock.lock_for_excl().then([this] { + state = RWState::RWEXCL; + })); + default: + ceph_assert(0 == "impossible"); + return interruptor::now(); + } + } + + void release_lock() { + switch (state) { + case RWState::RWREAD: + obc->lock.unlock_for_read(); + break; + case RWState::RWWRITE: + obc->lock.unlock_for_write(); + break; + case RWState::RWEXCL: + obc->lock.unlock_for_excl(); + break; + case RWState::RWNONE: + // noop + break; + default: + ceph_assert(0 == "invalid"); + } + state = RWState::RWNONE; + } + }; + state_t head_state; + state_t target_state; + + friend ObjectContextLoader; + + void set_state_obc(state_t &s, ObjectContextRef _obc) { + s.obc = std::move(_obc); + s.obc->append_to(loader.obc_set_accessing); + } + + void release_state(state_t &s) { + LOG_PREFIX(ObjectContextLoader::release_state); + if (s.is_empty()) return; + + s.release_lock(); + SUBDEBUGDPP(osd, "releasing obc {}, {}", loader.dpp, *(s.obc), s.obc->obs); + s.obc->remove_from(loader.obc_set_accessing); + s = state_t(); + } + public: + Manager(Manager &&rhs) : loader(rhs.loader) { + std::swap(target, rhs.target); + std::swap(options, rhs.options); + std::swap(head_state, rhs.head_state); + std::swap(target_state, rhs.target_state); + } + + Manager &operator=(Manager &&o) { + this->~Manager(); + new(this) Manager(std::move(o)); + return *this; + } + + ObjectContextRef &get_obc() { + ceph_assert(!target_state.is_empty()); + ceph_assert(target_state.obc->is_loaded()); + return target_state.obc; + } + + ObjectContextRef &get_head_obc() { + ceph_assert(!head_state.is_empty()); + ceph_assert(head_state.obc->is_loaded()); + return head_state.obc; + } + + void release() { + release_state(head_state); + release_state(target_state); + } + + auto get_releaser() { + return seastar::defer([this] { + release(); + }); + } + + ~Manager() { + release(); + } + }; + + class Orderer { + friend ObjectContextLoader; + ObjectContextRef orderer_obc; + public: + CommonOBCPipeline 
&obc_pp() { + ceph_assert(orderer_obc); + return orderer_obc->obc_pipeline; + } + + ~Orderer() { + LOG_PREFIX(ObjectContextLoader::~Orderer); + SUBDEBUG(osd, "releasing obc {}", *(orderer_obc)); + } + }; + + Orderer get_obc_orderer(const hobject_t &oid) { + Orderer ret; + std::tie(ret.orderer_obc, std::ignore) = + obc_registry.get_cached_obc(oid.get_head()); + return ret; + } + + Manager get_obc_manager(const hobject_t &oid, bool resolve_clone = true) { + Manager ret(*this, oid); + ret.options.resolve_clone = resolve_clone; + return ret; + } + + Manager get_obc_manager( + Orderer &orderer, const hobject_t &oid, bool resolve_clone = true) { + Manager ret = get_obc_manager(oid, resolve_clone); + ret.set_state_obc(ret.head_state, orderer.orderer_obc); + return ret; + } + + using load_and_lock_ertr = load_obc_ertr; + using load_and_lock_iertr = interruptible::interruptible_errorator< + IOInterruptCondition, load_and_lock_ertr>; + using load_and_lock_fut = load_and_lock_iertr::future<>; +private: + load_and_lock_fut load_and_lock_head(Manager &, RWState::State); + load_and_lock_fut load_and_lock_clone(Manager &, RWState::State, bool lock_head=true); +public: + load_and_lock_fut load_and_lock(Manager &, RWState::State); + using interruptor = ::crimson::interruptible::interruptor< ::crimson::osd::IOInterruptCondition>; @@ -43,8 +250,13 @@ public: // See SnapTrimObjSubEvent::remove_or_update - in_removed_snaps_queue usage. template<RWState::State State> load_obc_iertr::future<> with_obc(hobject_t oid, - with_obc_func_t&& func, - bool resolve_clone = true); + with_obc_func_t func, + bool resolve_clone = true) { + auto manager = get_obc_manager(oid, resolve_clone); + co_await load_and_lock(manager, State); + co_await std::invoke( + func, manager.get_head_obc(), manager.get_obc()); + } // Use this variant in the case where the head object // obc is already locked and only the clone obc is needed. @@ -53,10 +265,20 @@ public: template<RWState::State State> load_obc_iertr::future<> with_clone_obc_only(ObjectContextRef head, hobject_t clone_oid, - with_obc_func_t&& func, - bool resolve_clone = true); - - load_obc_iertr::future<> reload_obc(ObjectContext& obc) const; + with_obc_func_t func, + bool resolve_clone = true) { + LOG_PREFIX(ObjectContextLoader::with_clone_obc_only); + SUBDEBUGDPP(osd, "{}", dpp, clone_oid); + auto manager = get_obc_manager(clone_oid, resolve_clone); + // We populate head_state here with the passed obc assuming that + // it has been loaded and locked appropriately. We do not populate + // head_state.state because we won't be taking or releasing any + // locks on head as part of this call.
+ manager.head_state.obc = head; + manager.head_state.obc->append_to(obc_set_accessing); + co_await load_and_lock_clone(manager, State, false); + co_await std::invoke(func, head, manager.get_obc()); + } void notify_on_change(bool is_primary); @@ -66,24 +288,9 @@ private: DoutPrefixProvider& dpp; obc_accessing_list_t obc_set_accessing; - template<RWState::State State> - load_obc_iertr::future<> with_clone_obc(const hobject_t& oid, - with_obc_func_t&& func, - bool resolve_clone); - - template<RWState::State State> - load_obc_iertr::future<> with_head_obc(const hobject_t& oid, - with_obc_func_t&& func); - - template<RWState::State State, bool track, typename Func> - load_obc_iertr::future<> with_locked_obc(const hobject_t& oid, - Func&& func); - - template<RWState::State State> - load_obc_iertr::future<ObjectContextRef> - get_or_load_obc(ObjectContextRef obc, - bool existed); - load_obc_iertr::future<> load_obc(ObjectContextRef obc); }; + +using ObjectContextManager = ObjectContextLoader::Manager; + } diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index 9bf60140374..cbc35c21a04 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -15,12 +15,15 @@ #include <seastar/core/thread.hh> +#include "crimson/common/log.h" #include "crimson/osd/exceptions.h" #include "crimson/osd/pg.h" #include "crimson/osd/watch.h" #include "osd/ClassHandler.h" #include "osd/SnapMapper.h" +SET_SUBSYS(osd); + namespace { seastar::logger& logger() { return crimson::get_logger(ceph_subsys_osd); @@ -464,10 +467,7 @@ auto OpsExecuter::do_const_op(Func&& f) { template <class Func> auto OpsExecuter::do_write_op(Func&& f, OpsExecuter::modified_by m) { ++num_write; - if (!osd_op_params) { - osd_op_params.emplace(); - fill_op_params(m); - } + check_init_op_params(m); return std::forward<Func>(f)(pg->get_backend(), obc->obs, txn); } OpsExecuter::call_errorator::future<> OpsExecuter::do_assert_ver( @@ -822,25 +822,100 @@ OpsExecuter::do_execute_op(OSDOp& osd_op) } } -void OpsExecuter::fill_op_params(OpsExecuter::modified_by m) +OpsExecuter::rep_op_fut_t +OpsExecuter::flush_changes_and_submit( + const std::vector<OSDOp>& ops, + SnapMapper& snap_mapper, + OSDriver& osdriver) { - osd_op_params->req_id = msg->get_reqid(); - osd_op_params->mtime = msg->get_mtime(); - osd_op_params->at_version = pg->get_next_version(); - osd_op_params->pg_trim_to = pg->get_pg_trim_to(); - osd_op_params->min_last_complete_ondisk = pg->get_min_last_complete_ondisk(); - osd_op_params->last_complete = pg->get_info().last_complete; - osd_op_params->user_modify = (m == modified_by::user); + const bool want_mutate = !txn.empty(); + // osd_op_params are instantiated by every wr-like operation. 
+ assert(osd_op_params || !want_mutate); + assert(obc); + + auto submitted = interruptor::now(); + auto all_completed = interruptor::now(); + + if (cloning_ctx) { + ceph_assert(want_mutate); + } + + apply_stats(); + if (want_mutate) { + osd_op_params->at_version = pg->get_next_version(); + osd_op_params->pg_trim_to = pg->get_pg_trim_to(); + osd_op_params->pg_committed_to = pg->get_pg_committed_to(); + osd_op_params->last_complete = pg->get_info().last_complete; + + std::vector<pg_log_entry_t> log_entries; + + if (cloning_ctx) { + log_entries.emplace_back(complete_cloning_ctx()); + } + + log_entries.emplace_back(prepare_head_update(ops, txn)); + + if (auto log_rit = log_entries.rbegin(); log_rit != log_entries.rend()) { + ceph_assert(log_rit->version == osd_op_params->at_version); + } + + /* + * This works around the gcc bug causing the generated code to incorrectly + * execute unconditionally before the predicate. + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101244 + */ + auto clone_obc = cloning_ctx + ? std::move(cloning_ctx->clone_obc) + : nullptr; + auto [_submitted, _all_completed] = co_await pg->submit_transaction( + std::move(obc), + std::move(clone_obc), + std::move(txn), + std::move(*osd_op_params), + std::move(log_entries) + ); + + submitted = std::move(_submitted); + all_completed = std::move(_all_completed); + } + + if (op_effects.size()) [[unlikely]] { + // need extra ref pg due to apply_stats() which can be executed after + // informing snap mapper + all_completed = + std::move(all_completed).then_interruptible([this, pg=this->pg] { + // let's do the cleaning of `op_effects` in destructor + return interruptor::do_for_each(op_effects, + [pg=std::move(pg)](auto& op_effect) { + return op_effect->execute(pg); + }); + }); + } + + co_return std::make_tuple( + std::move(submitted), + std::move(all_completed)); } -std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction( - const std::vector<OSDOp>& ops) +pg_log_entry_t OpsExecuter::prepare_head_update( + const std::vector<OSDOp>& ops, + ceph::os::Transaction &txn) { - // let's ensure we don't need to inform SnapMapper about this particular - // entry. + LOG_PREFIX(OpsExecuter::prepare_head_update); assert(obc->obs.oi.soid.snap >= CEPH_MAXSNAP); - std::vector<pg_log_entry_t> log_entries; - log_entries.emplace_back( + + update_clone_overlap(); + if (cloning_ctx) { + obc->ssc->snapset = std::move(cloning_ctx->new_snapset); + } + if (snapc.seq > obc->ssc->snapset.seq) { + // update snapset with latest snap context + obc->ssc->snapset.seq = snapc.seq; + obc->ssc->snapset.snaps.clear(); + } + + pg_log_entry_t ret{ obc->obs.exists ? pg_log_entry_t::MODIFY : pg_log_entry_t::DELETE, obc->obs.oi.soid, @@ -849,15 +924,38 @@ std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction( osd_op_params->user_modify ? osd_op_params->at_version.version : 0, osd_op_params->req_id, osd_op_params->mtime, - op_info.allows_returnvec() && !ops.empty() ? ops.back().rval.code : 0); + op_info.allows_returnvec() && !ops.empty() ? 
ops.back().rval.code : 0}; + if (op_info.allows_returnvec()) { // also the per-op values are recorded in the pg log - log_entries.back().set_op_returns(ops); - logger().debug("{} op_returns: {}", - __func__, log_entries.back().op_returns); + ret.set_op_returns(ops); + DEBUGDPP("op returns: {}", *pg, ret.op_returns); + } + ret.clean_regions = std::move(osd_op_params->clean_regions); + + + if (obc->obs.exists) { + obc->obs.oi.prior_version = obc->obs.oi.version; + obc->obs.oi.version = osd_op_params->at_version; + if (osd_op_params->user_modify) + obc->obs.oi.user_version = osd_op_params->at_version.version; + obc->obs.oi.last_reqid = osd_op_params->req_id; + obc->obs.oi.mtime = osd_op_params->mtime; + obc->obs.oi.local_mtime = ceph_clock_now(); + + obc->ssc->exists = true; + pg->get_backend().set_metadata( + obc->obs.oi.soid, + obc->obs.oi, + obc->obs.oi.soid.is_head() ? &(obc->ssc->snapset) : nullptr, + txn); + } else { + // reset cached ObjectState without enforcing eviction + obc->obs.oi = object_info_t(obc->obs.oi.soid); } - log_entries.back().clean_regions = std::move(osd_op_params->clean_regions); - return log_entries; + + DEBUGDPP("entry: {}", *pg, ret); + return ret; } // Defined here because there is a circular dependency between OpsExecuter and PG @@ -871,25 +969,26 @@ version_t OpsExecuter::get_last_user_version() const return pg->get_last_user_version(); } -std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone( +void OpsExecuter::prepare_cloning_ctx( const SnapContext& snapc, const ObjectState& initial_obs, const SnapSet& initial_snapset, PGBackend& backend, ceph::os::Transaction& txn) { + LOG_PREFIX(OpsExecuter::prepare_cloning_ctx); const hobject_t& soid = initial_obs.oi.soid; logger().debug("{} {} snapset={} snapc={}", __func__, soid, initial_snapset, snapc); - auto cloning_ctx = std::make_unique<CloningContext>(); + cloning_ctx = std::make_unique<CloningContext>(); cloning_ctx->new_snapset = initial_snapset; // clone object, the snap field is set to the seq of the SnapContext // at its creation. 
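// [Editorial illustration, not part of the patch] Worked example for the
// clone id assigned below and the cloned_snaps computation that follows:
// given snapc = {seq=5, snaps=[5,3,1]} (descending) and an initial
// snapset.seq of 3, the new clone gets coid.snap = 5 and
// cloned_snaps = [5] -- only the snaps newer than the last recorded seq
// are attributed to the clone being created.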
- hobject_t coid = soid; - coid.snap = snapc.seq; + cloning_ctx->coid = soid; + cloning_ctx->coid.snap = snapc.seq; // existing snaps are stored in descending order in snapc, // cloned_snaps vector will hold all the snaps stored until snapset.seq @@ -900,48 +999,63 @@ std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone( return std::vector<snapid_t>{std::begin(snapc.snaps), last}; }(); - auto clone_obc = prepare_clone(coid, osd_op_params->at_version); - osd_op_params->at_version.version++; + // make clone here, but populate in metadata in complete_cloning_ctx + backend.clone_for_write(soid, cloning_ctx->coid, txn); - // make clone - backend.clone(clone_obc->obs.oi, initial_obs, clone_obc->obs, txn); + cloning_ctx->clone_obc = prepare_clone(cloning_ctx->coid, initial_obs); delta_stats.num_objects++; - if (clone_obc->obs.oi.is_omap()) { + if (cloning_ctx->clone_obc->obs.oi.is_omap()) { delta_stats.num_objects_omap++; } delta_stats.num_object_clones++; // newsnapset is obc's ssc - cloning_ctx->new_snapset.clones.push_back(coid.snap); - cloning_ctx->new_snapset.clone_size[coid.snap] = initial_obs.oi.size; - cloning_ctx->new_snapset.clone_snaps[coid.snap] = cloned_snaps; + cloning_ctx->new_snapset.clones.push_back(cloning_ctx->coid.snap); + cloning_ctx->new_snapset.clone_size[cloning_ctx->coid.snap] = initial_obs.oi.size; + cloning_ctx->new_snapset.clone_snaps[cloning_ctx->coid.snap] = cloned_snaps; // clone_overlap should contain an entry for each clone // (an empty interval_set if there is no overlap) - auto &overlap = cloning_ctx->new_snapset.clone_overlap[coid.snap]; + auto &overlap = cloning_ctx->new_snapset.clone_overlap[cloning_ctx->coid.snap]; if (initial_obs.oi.size) { overlap.insert(0, initial_obs.oi.size); } // log clone - logger().debug("cloning v {} to {} v {} snaps={} snapset={}", - initial_obs.oi.version, coid, - osd_op_params->at_version, cloned_snaps, cloning_ctx->new_snapset); + DEBUGDPP("cloning v {} to {} v {} snaps={} snapset={}", *pg, + initial_obs.oi.version, cloning_ctx->coid, + osd_op_params->at_version, cloned_snaps, cloning_ctx->new_snapset); +} - cloning_ctx->log_entry = { +pg_log_entry_t OpsExecuter::complete_cloning_ctx() +{ + ceph_assert(cloning_ctx); + const auto &coid = cloning_ctx->coid; + cloning_ctx->clone_obc->obs.oi.version = osd_op_params->at_version; + + osd_op_params->at_version.version++; + + pg->get_backend().set_metadata( + cloning_ctx->coid, + cloning_ctx->clone_obc->obs.oi, + nullptr /* snapset */, + txn); + + pg_log_entry_t ret{ pg_log_entry_t::CLONE, coid, - clone_obc->obs.oi.version, - clone_obc->obs.oi.prior_version, - clone_obc->obs.oi.user_version, + cloning_ctx->clone_obc->obs.oi.version, + cloning_ctx->clone_obc->obs.oi.prior_version, + cloning_ctx->clone_obc->obs.oi.user_version, osd_reqid_t(), - clone_obc->obs.oi.mtime, // will be replaced in `apply_to()` + cloning_ctx->clone_obc->obs.oi.mtime, // will be replaced in `apply_to()` 0 }; - encode(cloned_snaps, cloning_ctx->log_entry.snaps); - cloning_ctx->log_entry.clean_regions.mark_data_region_dirty(0, initial_obs.oi.size); - - return cloning_ctx; + ceph_assert(cloning_ctx->new_snapset.clone_snaps.count(coid.snap)); + encode(cloning_ctx->new_snapset.clone_snaps[coid.snap], ret.snaps); + ret.clean_regions.mark_data_region_dirty(0, cloning_ctx->clone_obc->obs.oi.size); + ret.mtime = cloning_ctx->clone_obc->obs.oi.mtime; + return ret; } void OpsExecuter::update_clone_overlap() { @@ -964,47 +1078,16 @@ void OpsExecuter::update_clone_overlap() { delta_stats.num_bytes += 
osd_op_params->modified_ranges.size(); } -void OpsExecuter::CloningContext::apply_to( - std::vector<pg_log_entry_t>& log_entries, - ObjectContext& processed_obc) && -{ - log_entry.mtime = processed_obc.obs.oi.mtime; - log_entries.insert(log_entries.begin(), std::move(log_entry)); - processed_obc.ssc->snapset = std::move(new_snapset); -} - -std::vector<pg_log_entry_t> -OpsExecuter::flush_clone_metadata( - std::vector<pg_log_entry_t>&& log_entries, - SnapMapper& snap_mapper, - OSDriver& osdriver, - ceph::os::Transaction& txn) -{ - assert(!txn.empty()); - update_clone_overlap(); - if (cloning_ctx) { - std::move(*cloning_ctx).apply_to(log_entries, *obc); - } - if (snapc.seq > obc->ssc->snapset.seq) { - // update snapset with latest snap context - obc->ssc->snapset.seq = snapc.seq; - obc->ssc->snapset.snaps.clear(); - } - logger().debug("{} done, initial snapset={}, new snapset={}", - __func__, obc->obs.oi.soid, obc->ssc->snapset); - return std::move(log_entries); -} - ObjectContextRef OpsExecuter::prepare_clone( const hobject_t& coid, - eversion_t version) + const ObjectState& initial_obs) { ceph_assert(pg->is_primary()); ObjectState clone_obs{coid}; clone_obs.exists = true; - clone_obs.oi.version = version; - clone_obs.oi.prior_version = obc->obs.oi.version; - clone_obs.oi.copy_user_bits(obc->obs.oi); + // clone_obs.oi.version will be populated in complete_cloning_ctx + clone_obs.oi.prior_version = initial_obs.oi.version; + clone_obs.oi.copy_user_bits(initial_obs.oi); clone_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT); auto [clone_obc, existed] = pg->obc_registry.get_cached_obc(std::move(coid)); @@ -1035,11 +1118,12 @@ OpsExecuter::OpsExecuter(Ref<PG> pg, { if (op_info.may_write() && should_clone(*obc, snapc)) { do_write_op([this](auto& backend, auto& os, auto& txn) { - cloning_ctx = execute_clone(std::as_const(snapc), - std::as_const(obc->obs), - std::as_const(obc->ssc->snapset), - backend, - txn); + prepare_cloning_ctx( + std::as_const(snapc), + std::as_const(obc->obs), + std::as_const(obc->ssc->snapset), + backend, + txn); }); } } diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index e770e825b32..f5554bd6919 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -195,25 +195,26 @@ private: SnapContext snapc; // writer snap context struct CloningContext { + /// id of new clone, populated in prepare_cloning_ctx + hobject_t coid; + /// new snapset, populated in prepare_cloning_ctx SnapSet new_snapset; - pg_log_entry_t log_entry; - - void apply_to( - std::vector<pg_log_entry_t>& log_entries, - ObjectContext& processed_obc) &&; + /// populated in complete_cloning_ctx + ObjectContextRef clone_obc; }; std::unique_ptr<CloningContext> cloning_ctx; - /** - * execute_clone + * prepare_cloning_ctx * * If snapc contains a snap which occurred logically after the last write * seen by this object (see OpsExecuter::should_clone()), we first need - * make a clone of the object at its current state. execute_clone primes - * txn with that clone operation and returns an - * OpsExecuter::CloningContext which will allow us to fill in the corresponding - * metadata and log_entries once the operations have been processed. + * make a clone of the object at its current state. prepare_cloning_ctx + * primes txn with that clone operation and populates cloning_ctx with + * an obc for the clone and a new snapset reflecting the clone. 
+ * + * complete_cloning_ctx later uses the information from cloning_ctx to + * generate a log entry and object_info versions for the clone. * * Note that this strategy differs from classic, which instead performs this * work at the end and reorders the transaction. See @@ -226,13 +227,15 @@ private: * @param backend [in,out] interface for generating mutations * @param txn [out] transaction for the operation */ - std::unique_ptr<CloningContext> execute_clone( + void prepare_cloning_ctx( const SnapContext& snapc, const ObjectState& initial_obs, const SnapSet& initial_snapset, PGBackend& backend, ceph::os::Transaction& txn); + /// complete clone, populate clone_obc, return log entry + pg_log_entry_t complete_cloning_ctx(); /** * should_clone @@ -263,12 +266,6 @@ private: */ void update_clone_overlap(); - std::vector<pg_log_entry_t> flush_clone_metadata( - std::vector<pg_log_entry_t>&& log_entries, - SnapMapper& snap_mapper, - OSDriver& osdriver, - ceph::os::Transaction& txn); - private: // this gizmo could be wrapped in std::optional for the sake of lazy // initialization. we don't need it for ops that doesn't have effect @@ -399,15 +396,22 @@ public: std::tuple<interruptible_future<>, interruptible_future<>>; using rep_op_fut_t = interruptible_future<rep_op_fut_tuple>; - template <typename MutFunc> - rep_op_fut_t flush_changes_n_do_ops_effects( + rep_op_fut_t flush_changes_and_submit( const std::vector<OSDOp>& ops, SnapMapper& snap_mapper, - OSDriver& osdriver, - MutFunc mut_func) &&; - std::vector<pg_log_entry_t> prepare_transaction( - const std::vector<OSDOp>& ops); - void fill_op_params(modified_by m); + OSDriver& osdriver); + pg_log_entry_t prepare_head_update( + const std::vector<OSDOp>& ops, + ceph::os::Transaction &txn); + + void check_init_op_params(OpsExecuter::modified_by m) { + if (!osd_op_params) { + osd_op_params.emplace(); + osd_op_params->req_id = msg->get_reqid(); + osd_op_params->mtime = msg->get_mtime(); + osd_op_params->user_modify = (m == modified_by::user); + } + } ObjectContextRef get_obc() const { return obc; @@ -442,7 +446,7 @@ public: ObjectContextRef prepare_clone( const hobject_t& coid, - eversion_t version); + const ObjectState& initial_obs); void apply_stats(); }; @@ -484,67 +488,6 @@ auto OpsExecuter::with_effect_on_obc( return std::forward<MainFunc>(main_func)(ctx_ref); } -template <typename MutFunc> -OpsExecuter::rep_op_fut_t -OpsExecuter::flush_changes_n_do_ops_effects( - const std::vector<OSDOp>& ops, - SnapMapper& snap_mapper, - OSDriver& osdriver, - MutFunc mut_func) && -{ - const bool want_mutate = !txn.empty(); - // osd_op_params are instantiated by every wr-like operation. 
- assert(osd_op_params || !want_mutate); - assert(obc); - - auto submitted = interruptor::now(); - auto all_completed = interruptor::now(); - - if (cloning_ctx) { - ceph_assert(want_mutate); - } - - if (want_mutate) { - auto log_entries = flush_clone_metadata( - prepare_transaction(ops), - snap_mapper, - osdriver, - txn); - - if (auto log_rit = log_entries.rbegin(); log_rit != log_entries.rend()) { - ceph_assert(log_rit->version == osd_op_params->at_version); - } - - auto [_submitted, _all_completed] = co_await mut_func( - std::move(txn), - std::move(obc), - std::move(*osd_op_params), - std::move(log_entries)); - - submitted = std::move(_submitted); - all_completed = std::move(_all_completed); - } - - apply_stats(); - - if (op_effects.size()) [[unlikely]] { - // need extra ref pg due to apply_stats() which can be executed after - // informing snap mapper - all_completed = - std::move(all_completed).then_interruptible([this, pg=this->pg] { - // let's do the cleaning of `op_effects` in destructor - return interruptor::do_for_each(op_effects, - [pg=std::move(pg)](auto& op_effect) { - return op_effect->execute(pg); - }); - }); - } - - co_return std::make_tuple( - std::move(submitted), - std::move(all_completed)); -} - template <class Func> struct OpsExecuter::RollbackHelper { void rollback_obc_if_modified(); diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc index 34ad97ceb06..0f19bfd7145 100644 --- a/src/crimson/osd/osd.cc +++ b/src/crimson/osd/osd.cc @@ -504,6 +504,8 @@ seastar::future<> OSD::start() }).then_unpack([this] { return _add_me_to_crush(); }).then([this] { + return _add_device_class(); + }).then([this] { monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0); monc->sub_want("mgrmap", 0, 0); monc->sub_want("osdmap", 0, 0); @@ -608,6 +610,38 @@ seastar::future<> OSD::_send_boot() return monc->send_message(std::move(m)); } +seastar::future<> OSD::_add_device_class() +{ + LOG_PREFIX(OSD::_add_device_class); + if (!local_conf().get_val<bool>("osd_class_update_on_start")) { + co_return; + } + + std::string device_class = co_await store.get_default_device_class(); + if (device_class.empty()) { + WARN("Device class is empty; skipping crush update."); + co_return; + } + + INFO("device_class is {}", device_class); + + std::string cmd = fmt::format( + R"({{"prefix": "osd crush set-device-class", "class": "{}", "ids": ["{}"]}})", + device_class, stringify(whoami) + ); + + auto [code, message, out] = co_await monc->run_command(std::move(cmd), {}); + if (code) { + // to be caught by crimson/osd/main.cc + WARN("failed to set device_class: {} ({})", message, code); + throw std::runtime_error("failed to set device_class"); + } else { + INFO("device_class was set: {}", message); + } + + co_return; +} + seastar::future<> OSD::_add_me_to_crush() { LOG_PREFIX(OSD::_add_me_to_crush); diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h index d7d54d5d2c3..1a84ccd6a3f 100644 --- a/src/crimson/osd/osd.h +++ b/src/crimson/osd/osd.h @@ -188,6 +188,7 @@ private: seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap); seastar::future<> _send_boot(); seastar::future<> _add_me_to_crush(); + seastar::future<> _add_device_class(); seastar::future<> osdmap_subscribe(version_t epoch, bool force_request); diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h index fd8b049c0bf..394375c1129 100644 --- a/src/crimson/osd/osd_operation.h +++ b/src/crimson/osd/osd_operation.h @@ -50,24 +50,36 @@ struct PGPeeringPipeline { }; struct CommonPGPipeline { - struct
WaitForActive : OrderedExclusivePhaseT<WaitForActive> { - static constexpr auto type_name = "CommonPGPipeline:::wait_for_active"; - } wait_for_active; - struct RecoverMissing : OrderedConcurrentPhaseT<RecoverMissing> { - static constexpr auto type_name = "CommonPGPipeline::recover_missing"; - } recover_missing; - struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT<CheckAlreadyCompleteGetObc> { - static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc"; - } check_already_complete_get_obc; - struct LockOBC : OrderedConcurrentPhaseT<LockOBC> { - static constexpr auto type_name = "CommonPGPipeline::lock_obc"; - } lock_obc; + struct WaitPGReady : OrderedConcurrentPhaseT<WaitPGReady> { + static constexpr auto type_name = "CommonPGPipeline::wait_pg_ready"; + } wait_pg_ready; + struct GetOBC : OrderedExclusivePhaseT<GetOBC> { + static constexpr auto type_name = "CommonPGPipeline::get_obc"; + } get_obc; +}; + +struct PGRepopPipeline { + struct Process : OrderedExclusivePhaseT<Process> { + static constexpr auto type_name = "PGRepopPipeline::process"; + } process; + struct WaitCommit : OrderedConcurrentPhaseT<WaitCommit> { + static constexpr auto type_name = "PGRepopPipeline::wait_commit"; + } wait_commit; + struct SendReply : OrderedExclusivePhaseT<SendReply> { + static constexpr auto type_name = "PGRepopPipeline::send_reply"; + } send_reply; +}; + +struct CommonOBCPipeline { struct Process : OrderedExclusivePhaseT<Process> { - static constexpr auto type_name = "CommonPGPipeline::process"; + static constexpr auto type_name = "CommonOBCPipeline::process"; } process; struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> { - static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop"; + static constexpr auto type_name = "CommonOBCPipeline::wait_repop"; } wait_repop; + struct SendReply : OrderedExclusivePhaseT<SendReply> { + static constexpr auto type_name = "CommonOBCPipeline::send_reply"; + } send_reply; }; @@ -205,6 +217,9 @@ protected: public: static constexpr bool is_trackable = true; + virtual bool requires_pg() const { + return true; + } }; template <class T> @@ -326,6 +341,18 @@ public: with_throttle_while(std::forward<Args>(args)...), *this); } + // Returns std::nullopt if the throttle is acquired immediately; + // otherwise returns a future that resolves once it has been acquired + std::optional<seastar::future<>> + try_acquire_throttle_now(crimson::osd::scheduler::params_t params) { + if (!max_in_progress || in_progress < max_in_progress) { + ++in_progress; + --pending; + return std::nullopt; + } + return acquire_throttle(params); + } + private: void dump_detail(Formatter *f) const final; diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h index d2786a95e4d..6a2d7e3ccbd 100644 --- a/src/crimson/osd/osd_operation_external_tracking.h +++ b/src/crimson/osd/osd_operation_external_tracking.h @@ -25,24 +25,23 @@ struct LttngBackend ConnectionPipeline::AwaitMap::BlockingEvent::Backend, ConnectionPipeline::GetPGMapping::BlockingEvent::Backend, PerShardPipeline::CreateOrWaitPG::BlockingEvent::Backend, + CommonPGPipeline::WaitPGReady::BlockingEvent::Backend, + CommonPGPipeline::WaitPGReady::BlockingEvent::ExitBarrierEvent::Backend, + CommonPGPipeline::GetOBC::BlockingEvent::Backend, OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, PGMap::PGCreationBlockingEvent::Backend, - ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend, PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, -
ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend, PGActivationBlocker::BlockingEvent::Backend, scrub::PGScrubber::BlockingEvent::Backend, - ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend, - ClientRequest::PGPipeline::RecoverMissing:: - BlockingEvent::ExitBarrierEvent::Backend, - ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend, - ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend, - ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend, - ClientRequest::PGPipeline::Process::BlockingEvent::Backend, - ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend, - ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, - ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend, - ClientRequest::CompletionEvent::Backend + ClientRequest::CompletionEvent::Backend, + CommonOBCPipeline::Process::BlockingEvent::Backend, + CommonOBCPipeline::WaitRepop::BlockingEvent::Backend, + CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, + CommonOBCPipeline::SendReply::BlockingEvent::Backend, + PGRepopPipeline::Process::BlockingEvent::Backend, + PGRepopPipeline::WaitCommit::BlockingEvent::Backend, + PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent::Backend, + PGRepopPipeline::SendReply::BlockingEvent::Backend { void handle(ClientRequest::StartEvent&, const Operation&) override {} @@ -72,24 +71,28 @@ struct LttngBackend const PerShardPipeline::CreateOrWaitPG& blocker) override { } - void handle(PGMap::PGCreationBlockingEvent&, - const Operation&, - const PGMap::PGCreationBlocker&) override { + void handle(CommonPGPipeline::WaitPGReady::BlockingEvent& ev, + const Operation& op, + const CommonPGPipeline::WaitPGReady& blocker) override { } - void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev, + void handle(CommonPGPipeline::WaitPGReady::BlockingEvent::ExitBarrierEvent& ev, + const Operation& op) override { + } + + void handle(CommonPGPipeline::GetOBC::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::AwaitMap& blocker) override { + const CommonPGPipeline::GetOBC& blocker) override { } - void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&, + void handle(PGMap::PGCreationBlockingEvent&, const Operation&, - const PG_OSDMapGate::OSDMapBlocker&) override { + const PGMap::PGCreationBlocker&) override { } - void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::WaitForActive& blocker) override { + void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const PG_OSDMapGate::OSDMapBlocker&) override { } void handle(PGActivationBlocker::BlockingEvent& ev, @@ -102,51 +105,47 @@ struct LttngBackend const scrub::PGScrubber& blocker) override { } - void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev, + void handle(CommonOBCPipeline::Process::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::RecoverMissing& blocker) override { + const CommonOBCPipeline::Process& blocker) override { } - void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::ExitBarrierEvent& ev, - const Operation& op) override { - } - - void handle(ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent& ev, + void handle(CommonOBCPipeline::WaitRepop::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override { + const 
CommonOBCPipeline::WaitRepop& blocker) override { } - - void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::LockOBC& blocker) override { + void handle(CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev, + const Operation& op) override { } - void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent& ev, - const Operation& op) override { + void handle(CommonOBCPipeline::SendReply::BlockingEvent& ev, + const Operation& op, + const CommonOBCPipeline::SendReply& blocker) override { } - void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev, + void handle(PGRepopPipeline::Process::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::Process& blocker) override { + const PGRepopPipeline::Process& blocker) override { } - void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev, + void handle(PGRepopPipeline::WaitCommit::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::WaitRepop& blocker) override { + const PGRepopPipeline::WaitCommit& blocker) override { } - void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev, + void handle(PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent& ev, const Operation& op) override { } - void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev, + void handle(PGRepopPipeline::SendReply::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::SendReply& blocker) override { + const PGRepopPipeline::SendReply& blocker) override { } void handle(ClientRequest::CompletionEvent&, const Operation&) override {} + }; struct HistoricBackend @@ -155,24 +154,23 @@ struct HistoricBackend ConnectionPipeline::AwaitMap::BlockingEvent::Backend, ConnectionPipeline::GetPGMapping::BlockingEvent::Backend, PerShardPipeline::CreateOrWaitPG::BlockingEvent::Backend, + CommonPGPipeline::WaitPGReady::BlockingEvent::Backend, + CommonPGPipeline::WaitPGReady::BlockingEvent::ExitBarrierEvent::Backend, + CommonPGPipeline::GetOBC::BlockingEvent::Backend, OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, PGMap::PGCreationBlockingEvent::Backend, - ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend, PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend, - ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend, PGActivationBlocker::BlockingEvent::Backend, scrub::PGScrubber::BlockingEvent::Backend, - ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend, - ClientRequest::PGPipeline::RecoverMissing:: - BlockingEvent::ExitBarrierEvent::Backend, - ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend, - ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend, - ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend, - ClientRequest::PGPipeline::Process::BlockingEvent::Backend, - ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend, - ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, - ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend, - ClientRequest::CompletionEvent::Backend + ClientRequest::CompletionEvent::Backend, + CommonOBCPipeline::Process::BlockingEvent::Backend, + CommonOBCPipeline::WaitRepop::BlockingEvent::Backend, + CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend, + CommonOBCPipeline::SendReply::BlockingEvent::Backend, + PGRepopPipeline::Process::BlockingEvent::Backend, + 
PGRepopPipeline::WaitCommit::BlockingEvent::Backend, + PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent::Backend, + PGRepopPipeline::SendReply::BlockingEvent::Backend { void handle(ClientRequest::StartEvent&, const Operation&) override {} @@ -202,24 +200,28 @@ struct HistoricBackend const PerShardPipeline::CreateOrWaitPG& blocker) override { } - void handle(PGMap::PGCreationBlockingEvent&, - const Operation&, - const PGMap::PGCreationBlocker&) override { + void handle(CommonPGPipeline::WaitPGReady::BlockingEvent& ev, + const Operation& op, + const CommonPGPipeline::WaitPGReady& blocker) override { + } + + void handle(CommonPGPipeline::WaitPGReady::BlockingEvent::ExitBarrierEvent& ev, + const Operation& op) override { } - void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev, + void handle(CommonPGPipeline::GetOBC::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::AwaitMap& blocker) override { + const CommonPGPipeline::GetOBC& blocker) override { } - void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&, + void handle(PGMap::PGCreationBlockingEvent&, const Operation&, - const PG_OSDMapGate::OSDMapBlocker&) override { + const PGMap::PGCreationBlocker&) override { } - void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::WaitForActive& blocker) override { + void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&, + const Operation&, + const PG_OSDMapGate::OSDMapBlocker&) override { } void handle(PGActivationBlocker::BlockingEvent& ev, @@ -232,55 +234,52 @@ struct HistoricBackend const scrub::PGScrubber& blocker) override { } - void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev, - const Operation& op, - const ClientRequest::PGPipeline::RecoverMissing& blocker) override { - } - - void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::ExitBarrierEvent& ev, - const Operation& op) override { + static const ClientRequest& to_client_request(const Operation& op) { +#ifdef NDEBUG + return static_cast<const ClientRequest&>(op); +#else + return dynamic_cast<const ClientRequest&>(op); +#endif } - void handle(ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent& ev, + void handle(CommonOBCPipeline::Process::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override { + const CommonOBCPipeline::Process& blocker) override { } - void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev, + void handle(CommonOBCPipeline::WaitRepop::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::LockOBC& blocker) override { + const CommonOBCPipeline::WaitRepop& blocker) override { } - void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent& ev, + void handle(CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev, const Operation& op) override { } - void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev, + void handle(CommonOBCPipeline::SendReply::BlockingEvent& ev, + const Operation& op, + const CommonOBCPipeline::SendReply& blocker) override { + } + + void handle(PGRepopPipeline::Process::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::Process& blocker) override { + const PGRepopPipeline::Process& blocker) override { } - void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev, + void handle(PGRepopPipeline::WaitCommit::BlockingEvent& ev, const Operation& op, - const 
ClientRequest::PGPipeline::WaitRepop& blocker) override { + const PGRepopPipeline::WaitCommit& blocker) override { } - void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev, + void handle(PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent& ev, const Operation& op) override { } - void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev, + void handle(PGRepopPipeline::SendReply::BlockingEvent& ev, const Operation& op, - const ClientRequest::PGPipeline::SendReply& blocker) override { + const PGRepopPipeline::SendReply& blocker) override { } - static const ClientRequest& to_client_request(const Operation& op) { -#ifdef NDEBUG - return static_cast<const ClientRequest&>(op); -#else - return dynamic_cast<const ClientRequest&>(op); -#endif - } void handle(ClientRequest::CompletionEvent&, const Operation& op) override { if (crimson::common::local_conf()->osd_op_history_size) { diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc index a89fb2c84bc..fcd0f318db2 100644 --- a/src/crimson/osd/osd_operations/client_request.cc +++ b/src/crimson/osd/osd_operations/client_request.cc @@ -14,6 +14,7 @@ #include "crimson/osd/osd_operations/client_request.h" #include "crimson/osd/osd_connection_priv.h" #include "osd/object_state_fmt.h" +#include "osd/osd_perf_counters.h" SET_SUBSYS(osd); @@ -42,15 +43,17 @@ void ClientRequest::Orderer::clear_and_cancel(PG &pg) { LOG_PREFIX(ClientRequest::Orderer::clear_and_cancel); for (auto i = list.begin(); i != list.end(); ) { - DEBUGDPP("{}", pg, *i); - i->complete_request(); - remove_request(*(i++)); + auto &req = *i; + DEBUGDPP("{}", pg, req); + ++i; + req.complete_request(pg); } } -void ClientRequest::complete_request() +void ClientRequest::complete_request(PG &pg) { track_event<CompletionEvent>(); + pg.client_request_orderer.remove_request(*this); on_complete.set_value(); } @@ -98,7 +101,7 @@ PerShardPipeline &ClientRequest::get_pershard_pipeline( return shard_services.get_client_request_pipeline(); } -ClientRequest::PGPipeline &ClientRequest::client_pp(PG &pg) +CommonPGPipeline &ClientRequest::client_pp(PG &pg) { return pg.request_pg_pipeline; } @@ -137,12 +140,20 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib DEBUGDPP("{} start", *pgref, *this); PG &pg = *pgref; + + DEBUGDPP("{}.{}: entering wait_pg_ready stage", + *pgref, *this, this_instance_id); + // The prior stage is OrderedExclusive (PerShardPipeline::create_or_wait_pg) + // and wait_pg_ready is OrderedConcurrent. This transition, therefore, cannot + // block and using enter_stage_sync is legal and more efficient than + // enter_stage. + ihref.enter_stage_sync(client_pp(pg).wait_pg_ready, *this); + if (!m->get_hobj().get_key().empty()) { // There are no users of locator. It was used to ensure that multipart-upload // parts would end up in the same PG so that they could be clone_range'd into // the same object via librados, but that's not how multipart upload works // anymore and we no longer support clone_range via librados. 
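// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] Why the enter_stage_sync() call
// justified in the comment above is safe: a toy, synchronous model of the two
// phase kinds. An OrderedExclusive phase admits one operation at a time, so
// the departing operation is by construction the sole occupant; an
// OrderedConcurrent phase admits unconditionally, so crossing from exclusive
// to concurrent can never wait. The real pipeline is future-based (seastar);
// the Toy* names are invented for illustration.
#include <cassert>
#include <cstddef>

struct ToyExclusivePhase {
  bool occupied = false;
  bool try_enter() {            // a real phase would queue and resolve a future
    if (occupied) return false;
    occupied = true;
    return true;
  }
  void exit() { occupied = false; }
};

struct ToyConcurrentPhase {
  std::size_t occupants = 0;
  void enter() { ++occupants; } // always succeeds immediately
  void exit() { assert(occupants > 0); --occupants; }
};

int main() {
  ToyExclusivePhase create_or_wait_pg;  // cf. PerShardPipeline::CreateOrWaitPG
  ToyConcurrentPhase wait_pg_ready;     // cf. CommonPGPipeline::WaitPGReady
  assert(create_or_wait_pg.try_enter());
  wait_pg_ready.enter();                // unconditional: enter_stage_sync() analogue
  create_or_wait_pg.exit();
  wait_pg_ready.exit();
}
// ---------------------------------------------------------------------------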
- get_handle().exit(); co_await reply_op_error(pgref, -ENOTSUP); co_return; } @@ -152,32 +163,24 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib std::ref(get_foreign_connection()), m->get_map_epoch() )); DEBUGDPP("{}: discarding {}", *pgref, *this, this_instance_id); - pgref->client_request_orderer.remove_request(*this); - complete_request(); co_return; } - DEBUGDPP("{}.{}: entering await_map stage", - *pgref, *this, this_instance_id); - co_await ihref.enter_stage<interruptor>(client_pp(pg).await_map, *this); - DEBUGDPP("{}.{}: entered await_map stage, waiting for map", - pg, *this, this_instance_id); + auto map_epoch = co_await interruptor::make_interruptible( ihref.enter_blocker( *this, pg.osdmap_gate, &decltype(pg.osdmap_gate)::wait_for_map, m->get_min_epoch(), nullptr)); - DEBUGDPP("{}.{}: map epoch got {}, entering wait_for_active", + DEBUGDPP("{}.{}: waited for epoch {}, waiting for active", pg, *this, this_instance_id, map_epoch); - co_await ihref.enter_stage<interruptor>(client_pp(pg).wait_for_active, *this); - - DEBUGDPP("{}.{}: entered wait_for_active stage, waiting for active", - pg, *this, this_instance_id); co_await interruptor::make_interruptible( ihref.enter_blocker( *this, pg.wait_for_active_blocker, &decltype(pg.wait_for_active_blocker)::wait)); + co_await ihref.enter_stage<interruptor>(client_pp(pg).get_obc, *this); + if (int res = op_info.set_from_op(&*m, *pg.get_osdmap()); res != 0) { co_await reply_op_error(pgref, res); @@ -190,15 +193,25 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib DEBUGDPP("{}.{}: dropping misdirected op", pg, *this, this_instance_id); co_return; - } else if (const hobject_t& hoid = m->get_hobj(); - !pg.get_peering_state().can_serve_replica_read(hoid)) { + } + + pg.get_perf_logger().inc(l_osd_replica_read); + if (pg.is_unreadable_object(m->get_hobj())) { + DEBUGDPP("{}.{}: {} missing on replica, bouncing to primary", + pg, *this, this_instance_id, m->get_hobj()); + pg.get_perf_logger().inc(l_osd_replica_read_redirect_missing); + co_await reply_op_error(pgref, -EAGAIN); + co_return; + } else if (!pg.get_peering_state().can_serve_replica_read(m->get_hobj())) { DEBUGDPP("{}.{}: unstable write on replica, bouncing to primary", pg, *this, this_instance_id); + pg.get_perf_logger().inc(l_osd_replica_read_redirect_conflict); co_await reply_op_error(pgref, -EAGAIN); co_return; } else { DEBUGDPP("{}.{}: serving replica read on oid {}", pg, *this, this_instance_id, m->get_hobj()); + pg.get_perf_logger().inc(l_osd_replica_read_served); } } @@ -228,12 +241,6 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib DEBUGDPP("{}.{}: process[_pg]_op complete, completing handle", *pgref, *this, this_instance_id); co_await interruptor::make_interruptible(ihref.handle.complete()); - - DEBUGDPP("{}.{}: process[_pg]_op complete," - "removing request from orderer", - *pgref, *this, this_instance_id); - pgref->client_request_orderer.remove_request(*this); - complete_request(); } seastar::future<> ClientRequest::with_pg_process( @@ -249,16 +256,24 @@ seastar::future<> ClientRequest::with_pg_process( auto instance_handle = get_instance_handle(); auto &ihref = *instance_handle; return interruptor::with_interruption( - [this, pgref, this_instance_id, &ihref]() mutable { - return with_pg_process_interruptible(pgref, this_instance_id, ihref); + [FNAME, this, pgref, this_instance_id, &ihref]() mutable { + return with_pg_process_interruptible( + pgref, this_instance_id, ihref + 
).then_interruptible([FNAME, this, this_instance_id, pgref] { + DEBUGDPP("{}.{}: with_pg_process_interruptible complete," + " completing request", + *pgref, *this, this_instance_id); + complete_request(*pgref); + }); }, [FNAME, this, this_instance_id, pgref](std::exception_ptr eptr) { DEBUGDPP("{}.{}: interrupted due to {}", *pgref, *this, this_instance_id, eptr); }, pgref, pgref->get_osdmap_epoch()).finally( [this, FNAME, opref=std::move(opref), pgref, - this_instance_id, instance_handle=std::move(instance_handle), &ihref] { + this_instance_id, instance_handle=std::move(instance_handle), &ihref]() mutable { DEBUGDPP("{}.{}: exit", *pgref, *this, this_instance_id); - ihref.handle.exit(); + return ihref.handle.complete( + ).finally([instance_handle=std::move(instance_handle)] {}); }); } @@ -290,29 +305,41 @@ ClientRequest::process_pg_op( ClientRequest::interruptible_future<> ClientRequest::recover_missing_snaps( Ref<PG> pg, - instance_handle_t &ihref, - ObjectContextRef head, std::set<snapid_t> &snaps) { LOG_PREFIX(ClientRequest::recover_missing_snaps); - for (auto &snap : snaps) { - auto coid = head->obs.oi.soid; - coid.snap = snap; - auto oid = resolve_oid(head->get_head_ss(), coid); - /* Rollback targets may legitimately not exist if, for instance, - * the object is an rbd block which happened to be sparse and - * therefore non-existent at the time of the specified snapshot. - * In such a case, rollback will simply delete the object. Here, - * we skip the oid as there is no corresponding clone to recover. - * See https://tracker.ceph.com/issues/63821 */ - if (oid) { - auto unfound = co_await do_recover_missing(pg, *oid, m->get_reqid()); - if (unfound) { - DEBUGDPP("{} unfound, hang it for now", *pg, *oid); - co_await interruptor::make_interruptible( - pg->get_recovery_backend()->add_unfound(*oid)); + + std::vector<hobject_t> ret; + auto resolve_oids = pg->obc_loader.with_obc<RWState::RWREAD>( + m->get_hobj().get_head(), + [&snaps, &ret](auto head, auto) { + for (auto &snap : snaps) { + auto coid = head->obs.oi.soid; + coid.snap = snap; + auto oid = resolve_oid(head->get_head_ss(), coid); + /* Rollback targets may legitimately not exist if, for instance, + * the object is an rbd block which happened to be sparse and + * therefore non-existent at the time of the specified snapshot. + * In such a case, rollback will simply delete the object. Here, + * we skip the oid as there is no corresponding clone to recover. 
+ * See https://tracker.ceph.com/issues/63821 */ + if (oid) { + ret.emplace_back(std::move(*oid)); } } + return seastar::now(); + }).handle_error_interruptible( + crimson::ct_error::assert_all("unexpected error") + ); + co_await std::move(resolve_oids); + + for (auto &oid : ret) { + auto unfound = co_await do_recover_missing(pg, oid, m->get_reqid()); + if (unfound) { + DEBUGDPP("{} unfound, hang it for now", *pg, oid); + co_await interruptor::make_interruptible( + pg->get_recovery_backend()->add_unfound(oid)); + } } } @@ -321,7 +348,13 @@ ClientRequest::process_op( instance_handle_t &ihref, Ref<PG> pg, unsigned this_instance_id) { LOG_PREFIX(ClientRequest::process_op); - ihref.enter_stage_sync(client_pp(*pg).recover_missing, *this); + ihref.obc_orderer = pg->obc_loader.get_obc_orderer(m->get_hobj()); + auto obc_manager = pg->obc_loader.get_obc_manager( + *(ihref.obc_orderer), + m->get_hobj()); + co_await ihref.enter_stage<interruptor>( + ihref.obc_orderer->obc_pp().process, *this); + if (!pg->is_primary()) { DEBUGDPP( "Skipping recover_missings on non primary pg for soid {}", @@ -337,28 +370,10 @@ ClientRequest::process_op( std::set<snapid_t> snaps = snaps_need_to_recover(); if (!snaps.empty()) { - auto with_obc = pg->obc_loader.with_obc<RWState::RWREAD>( - m->get_hobj().get_head(), - [&snaps, &ihref, pg, this](auto head, auto) { - return recover_missing_snaps(pg, ihref, head, snaps); - }).handle_error_interruptible( - crimson::ct_error::assert_all("unexpected error") - ); - // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98401 - co_await std::move(with_obc); + co_await recover_missing_snaps(pg, snaps); } } - /** - * The previous stage of recover_missing is a concurrent phase. - * Checking for already_complete requests must done exclusively. - * Since get_obc is also an exclusive stage, we can merge both stages into - * a single stage and avoid stage switching overhead. - */ - DEBUGDPP("{}.{}: entering check_already_complete_get_obc", - *pg, *this, this_instance_id); - co_await ihref.enter_stage<interruptor>( - client_pp(*pg).check_already_complete_get_obc, *this); DEBUGDPP("{}.{}: checking already_complete", *pg, *this, this_instance_id); auto completed = co_await pg->already_complete(m->get_reqid()); @@ -385,51 +400,29 @@ ClientRequest::process_op( DEBUGDPP("{}.{}: past scrub blocker, getting obc", *pg, *this, this_instance_id); - // call with_locked_obc() in order, but wait concurrently for loading. - ihref.enter_stage_sync( - client_pp(*pg).lock_obc, *this); - auto process = pg->with_locked_obc( - m->get_hobj(), op_info, - [FNAME, this, pg, this_instance_id, &ihref] ( - auto head, auto obc - ) -> interruptible_future<> { - DEBUGDPP("{}.{}: got obc {}, entering process stage", - *pg, *this, this_instance_id, obc->obs); - return ihref.enter_stage<interruptor>( - client_pp(*pg).process, *this - ).then_interruptible( - [FNAME, this, pg, this_instance_id, obc, &ihref]() mutable { - DEBUGDPP("{}.{}: in process stage, calling do_process", - *pg, *this, this_instance_id); - return do_process( - ihref, pg, obc, this_instance_id - ); - } - ); - }).handle_error_interruptible( - PG::load_obc_ertr::all_same_way( - [FNAME, this, pg=std::move(pg), this_instance_id]( - const auto &code - ) -> interruptible_future<> { - DEBUGDPP("{}.{}: saw error code {}", - *pg, *this, this_instance_id, code); - assert(code.value() > 0); - return reply_op_error(pg, -code.value()); - }) - ); - /* The following works around gcc bug - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98401. 
- * The specific symptom I observed is the pg param being - * destructed multiple times resulting in the refcount going - * rapidly to 0 destoying the PG prematurely. - * - * This bug seems to be resolved in gcc 13.2.1. - * - * Assigning the intermediate result and moving it into the co_await - * expression bypasses both bugs. - */ - co_await std::move(process); + int load_err = co_await pg->obc_loader.load_and_lock( + obc_manager, pg->get_lock_type(op_info) + ).si_then([]() -> int { + return 0; + }).handle_error_interruptible( + PG::load_obc_ertr::all_same_way( + [](const auto &code) -> int { + return -code.value(); + }) + ); + if (load_err) { + DEBUGDPP("{}.{}: saw error code loading obc {}", + *pg, *this, this_instance_id, load_err); + co_await reply_op_error(pg, load_err); + co_return; + } + + DEBUGDPP("{}.{}: obc {} loaded and locked, calling do_process", + *pg, *this, this_instance_id, obc_manager.get_obc()->obs); + co_await do_process( + ihref, pg, obc_manager.get_obc(), this_instance_id + ); } ClientRequest::interruptible_future<> @@ -548,12 +541,14 @@ ClientRequest::do_process( std::move(ox), m->ops); co_await std::move(submitted); } - co_await ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this); + co_await ihref.enter_stage<interruptor>( + ihref.obc_orderer->obc_pp().wait_repop, *this); co_await std::move(all_completed); } - co_await ihref.enter_stage<interruptor>(client_pp(*pg).send_reply, *this); + co_await ihref.enter_stage<interruptor>( + ihref.obc_orderer->obc_pp().send_reply, *this); if (ret) { int err = -ret->value(); diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index 6ee57e9874c..91a6728fd4b 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -11,6 +11,7 @@ #include "osd/osd_op_util.h" #include "crimson/net/Connection.h" #include "crimson/osd/object_context.h" +#include "crimson/osd/object_context_loader.h" #include "crimson/osd/osdmap_gate.h" #include "crimson/osd/osd_operation.h" #include "crimson/osd/osd_operations/client_request_common.h" @@ -41,21 +42,9 @@ class ClientRequest final : public PhasedOperationT<ClientRequest>, unsigned instance_id = 0; public: - class PGPipeline : public CommonPGPipeline { - public: - struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> { - static constexpr auto type_name = "ClientRequest::PGPipeline::await_map"; - } await_map; - struct SendReply : OrderedExclusivePhaseT<SendReply> { - static constexpr auto type_name = "ClientRequest::PGPipeline::send_reply"; - } send_reply; - friend class ClientRequest; - friend class LttngBackend; - friend class HistoricBackend; - friend class ReqRequest; - friend class LogMissingRequest; - friend class LogMissingRequestReply; - }; + epoch_t get_epoch_sent_at() const { + return m->get_map_epoch(); + } /** * instance_handle_t @@ -93,20 +82,18 @@ public: // don't leave any references on the source core, so we just bypass it by using // intrusive_ptr instead. 
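// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] The intrusive_ptr pattern the
// comment above relies on: the refcount is embedded in the object itself, so
// no separately allocated control block can get stranded on the source core.
// A minimal, self-contained example; ToyHandle is an invented stand-in for
// instance_handle_t.
#include <boost/smart_ptr/intrusive_ptr.hpp>
#include <boost/smart_ptr/intrusive_ref_counter.hpp>

struct ToyHandle
  : boost::intrusive_ref_counter<ToyHandle, boost::thread_safe_counter> {
  int state = 0;
};

int main() {
  boost::intrusive_ptr<ToyHandle> ref{new ToyHandle()};
  auto alias = ref;                 // bumps the embedded counter
  alias->state = 42;
  return ref->state == 42 ? 0 : 1;  // object freed when the last ref drops
}
// ---------------------------------------------------------------------------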
using ref_t = boost::intrusive_ptr<instance_handle_t>; + std::optional<ObjectContextLoader::Orderer> obc_orderer; PipelineHandle handle; std::tuple< - PGPipeline::AwaitMap::BlockingEvent, + CommonPGPipeline::WaitPGReady::BlockingEvent, PG_OSDMapGate::OSDMapBlocker::BlockingEvent, - PGPipeline::WaitForActive::BlockingEvent, PGActivationBlocker::BlockingEvent, - PGPipeline::RecoverMissing::BlockingEvent, + CommonPGPipeline::GetOBC::BlockingEvent, + CommonOBCPipeline::Process::BlockingEvent, scrub::PGScrubber::BlockingEvent, - PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, - PGPipeline::LockOBC::BlockingEvent, - PGPipeline::Process::BlockingEvent, - PGPipeline::WaitRepop::BlockingEvent, - PGPipeline::SendReply::BlockingEvent, + CommonOBCPipeline::WaitRepop::BlockingEvent, + CommonOBCPipeline::SendReply::BlockingEvent, CompletionEvent > pg_tracking_events; @@ -210,7 +197,7 @@ public: void requeue(Ref<PG> pg); void clear_and_cancel(PG &pg); }; - void complete_request(); + void complete_request(PG &pg); static constexpr OperationTypeCode type = OperationTypeCode::client_request; @@ -285,8 +272,6 @@ private: interruptible_future<> recover_missing_snaps( Ref<PG> pg, - instance_handle_t &ihref, - ObjectContextRef head, std::set<snapid_t> &snaps); ::crimson::interruptible::interruptible_future< ::crimson::osd::IOInterruptCondition> process_op( @@ -295,7 +280,7 @@ private: unsigned this_instance_id); bool is_pg_op() const; - PGPipeline &client_pp(PG &pg); + CommonPGPipeline &client_pp(PG &pg); template <typename Errorator> using interruptible_errorator = diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc index 9e5867caf80..b8f7646bc74 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.cc +++ b/src/crimson/osd/osd_operations/internal_client_request.cc @@ -4,6 +4,7 @@ #include <seastar/core/future.hh> #include "crimson/osd/osd_operations/internal_client_request.h" +#include "osd/object_state_fmt.h" namespace { seastar::logger& logger() { @@ -51,46 +52,17 @@ CommonPGPipeline& InternalClientRequest::client_pp() } InternalClientRequest::interruptible_future<> -InternalClientRequest::do_process( - crimson::osd::ObjectContextRef obc, - std::vector<OSDOp> &osd_ops) -{ - LOG_PREFIX(InternalClientRequest::do_process); - auto params = get_do_osd_ops_params(); - OpsExecuter ox( - pg, obc, op_info, params, params.get_connection(), SnapContext{}); - co_await pg->run_executer( - ox, obc, op_info, osd_ops - ).handle_error_interruptible( - crimson::ct_error::all_same_way( - [this, FNAME](auto e) { - ERRORDPPI("{}: got unexpected error {}", *pg, *this, e); - ceph_assert(0 == "should not return an error"); - return interruptor::now(); - }) - ); - - auto [submitted, completed] = co_await pg->submit_executer( - std::move(ox), osd_ops); - - co_await std::move(submitted); - co_await std::move(completed); -} - -InternalClientRequest::interruptible_future<> InternalClientRequest::with_interruption() { LOG_PREFIX(InternalClientRequest::with_interruption); - co_await enter_stage<interruptor>( - client_pp().wait_for_active - ); + assert(pg->is_active()); - co_await with_blocking_event<PGActivationBlocker::BlockingEvent, - interruptor>([this] (auto&& trigger) { - return pg->wait_for_active_blocker.wait(std::move(trigger)); - }); + obc_orderer = pg->obc_loader.get_obc_orderer(get_target_oid()); + auto obc_manager = pg->obc_loader.get_obc_manager( + *obc_orderer, + get_target_oid()); - co_await 
enter_stage<interruptor>(client_pp().recover_missing); + co_await enter_stage<interruptor>(obc_orderer->obc_pp().process); bool unfound = co_await do_recover_missing( pg, get_target_oid(), osd_reqid_t()); @@ -100,10 +72,8 @@ InternalClientRequest::with_interruption() std::make_error_code(std::errc::operation_canceled), fmt::format("{} is unfound, drop it!", get_target_oid())); } - co_await enter_stage<interruptor>( - client_pp().check_already_complete_get_obc); - DEBUGI("{}: getting obc lock", *this); + DEBUGI("{}: generating ops", *this); auto osd_ops = create_osd_ops(); @@ -112,23 +82,38 @@ InternalClientRequest::with_interruption() [[maybe_unused]] const int ret = op_info.set_from_op( std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap()); assert(ret == 0); - // call with_locked_obc() in order, but wait concurrently for loading. - enter_stage_sync(client_pp().lock_obc); - - auto fut = pg->with_locked_obc( - get_target_oid(), op_info, - [&osd_ops, this](auto, auto obc) { - return enter_stage<interruptor>(client_pp().process - ).then_interruptible( - [obc=std::move(obc), &osd_ops, this]() mutable { - return do_process(std::move(obc), osd_ops); - }); - }).handle_error_interruptible( - crimson::ct_error::assert_all("unexpected error") - ); - co_await std::move(fut); - - logger().debug("{}: complete", *this); + + co_await pg->obc_loader.load_and_lock( + obc_manager, pg->get_lock_type(op_info) + ).handle_error_interruptible( + crimson::ct_error::assert_all("unexpected error") + ); + + auto params = get_do_osd_ops_params(); + OpsExecuter ox( + pg, obc_manager.get_obc(), op_info, params, params.get_connection(), + SnapContext{}); + co_await pg->run_executer( + ox, obc_manager.get_obc(), op_info, osd_ops + ).handle_error_interruptible( + crimson::ct_error::all_same_way( + [this, FNAME](auto e) { + ERRORDPPI("{}: got unexpected error {}", *pg, *this, e); + ceph_assert(0 == "should not return an error"); + return interruptor::now(); + }) + ); + + auto [submitted, completed] = co_await pg->submit_executer( + std::move(ox), osd_ops); + + co_await std::move(submitted); + + co_await enter_stage<interruptor>(obc_orderer->obc_pp().wait_repop); + + co_await std::move(completed); + + DEBUGDPP("{}: complete", *pg, *this); co_await interruptor::make_interruptible(handle.complete()); co_return; } @@ -150,7 +135,7 @@ seastar::future<> InternalClientRequest::start() return seastar::now(); }).finally([this] { logger().debug("{}: exit", *this); - handle.exit(); + return handle.complete(); }); } diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h index 6023db0a8db..1cfde4ab080 100644 --- a/src/crimson/osd/osd_operations/internal_client_request.h +++ b/src/crimson/osd/osd_operations/internal_client_request.h @@ -4,6 +4,7 @@ #pragma once #include "crimson/common/type_helpers.h" +#include "crimson/osd/object_context_loader.h" #include "crimson/osd/osd_operation.h" #include "crimson/osd/osd_operations/client_request_common.h" #include "crimson/osd/pg.h" @@ -45,11 +46,10 @@ private: crimson::osd::ObjectContextRef obc, std::vector<OSDOp> &osd_ops); - seastar::future<> do_process(); - Ref<PG> pg; epoch_t start_epoch; OpInfo op_info; + std::optional<ObjectContextLoader::Orderer> obc_orderer; PipelineHandle handle; public: @@ -57,12 +57,8 @@ public: std::tuple< StartEvent, - CommonPGPipeline::WaitForActive::BlockingEvent, - PGActivationBlocker::BlockingEvent, - CommonPGPipeline::RecoverMissing::BlockingEvent, - 
CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, - CommonPGPipeline::LockOBC::BlockingEvent, - CommonPGPipeline::Process::BlockingEvent, + CommonOBCPipeline::Process::BlockingEvent, + CommonOBCPipeline::WaitRepop::BlockingEvent, CompletionEvent > tracking_events; }; diff --git a/src/crimson/osd/osd_operations/logmissing_request.cc b/src/crimson/osd/osd_operations/logmissing_request.cc index 8147c969260..274744cdd92 100644 --- a/src/crimson/osd/osd_operations/logmissing_request.cc +++ b/src/crimson/osd/osd_operations/logmissing_request.cc @@ -58,9 +58,9 @@ PerShardPipeline &LogMissingRequest::get_pershard_pipeline( return shard_services.get_replicated_request_pipeline(); } -ClientRequest::PGPipeline &LogMissingRequest::client_pp(PG &pg) +PGRepopPipeline &LogMissingRequest::repop_pipeline(PG &pg) { - return pg.request_pg_pipeline; + return pg.repop_pipeline; } seastar::future<> LogMissingRequest::with_pg( @@ -73,7 +73,7 @@ seastar::future<> LogMissingRequest::with_pg( return interruptor::with_interruption([this, pg] { LOG_PREFIX(LogMissingRequest::with_pg); DEBUGI("{}: pg present", *this); - return this->template enter_stage<interruptor>(client_pp(*pg).await_map + return this->template enter_stage<interruptor>(repop_pipeline(*pg).process ).then_interruptible([this, pg] { return this->template with_blocking_event< PG_OSDMapGate::OSDMapBlocker::BlockingEvent diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h index 51c9d540cb5..fe4761c4ab4 100644 --- a/src/crimson/osd/osd_operations/logmissing_request.h +++ b/src/crimson/osd/osd_operations/logmissing_request.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); @@ -77,14 +80,14 @@ public: ConnectionPipeline::AwaitMap::BlockingEvent, ConnectionPipeline::GetPGMapping::BlockingEvent, PerShardPipeline::CreateOrWaitPG::BlockingEvent, - ClientRequest::PGPipeline::AwaitMap::BlockingEvent, + PGRepopPipeline::Process::BlockingEvent, PG_OSDMapGate::OSDMapBlocker::BlockingEvent, PGMap::PGCreationBlockingEvent, OSD_OSDMapGate::OSDMapBlocker::BlockingEvent > tracking_events; private: - ClientRequest::PGPipeline &client_pp(PG &pg); + PGRepopPipeline &repop_pipeline(PG &pg); crimson::net::ConnectionRef l_conn; crimson::net::ConnectionXcoreRef r_conn; diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.cc b/src/crimson/osd/osd_operations/logmissing_request_reply.cc index fb122a95cd1..5640610bd01 100644 --- a/src/crimson/osd/osd_operations/logmissing_request_reply.cc +++ b/src/crimson/osd/osd_operations/logmissing_request_reply.cc @@ -56,11 +56,6 @@ PerShardPipeline &LogMissingRequestReply::get_pershard_pipeline( return shard_services.get_replicated_request_pipeline(); } -ClientRequest::PGPipeline &LogMissingRequestReply::client_pp(PG &pg) -{ - return pg.request_pg_pipeline; -} - seastar::future<> LogMissingRequestReply::with_pg( ShardServices &shard_services, Ref<PG> pg) { diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h index c741b41bd0f..bdb6c2ac6ac 100644 --- a/src/crimson/osd/osd_operations/logmissing_request_reply.h +++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { 
return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); @@ -82,8 +85,6 @@ public: > tracking_events; private: - ClientRequest::PGPipeline &client_pp(PG &pg); - crimson::net::ConnectionRef l_conn; crimson::net::ConnectionXcoreRef r_conn; diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h index 102cb7fff6b..14202582100 100644 --- a/src/crimson/osd/osd_operations/osdop_params.h +++ b/src/crimson/osd/osd_operations/osdop_params.h @@ -12,7 +12,7 @@ struct osd_op_params_t { utime_t mtime; eversion_t at_version; eversion_t pg_trim_to; - eversion_t min_last_complete_ondisk; + eversion_t pg_committed_to; eversion_t last_complete; bool user_modify = false; ObjectCleanRegions clean_regions; diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc index a8d9fce69b6..fb5696b0a9e 100644 --- a/src/crimson/osd/osd_operations/peering_event.cc +++ b/src/crimson/osd/osd_operations/peering_event.cc @@ -166,7 +166,8 @@ void RemotePeeringEvent::on_pg_absent(ShardServices &shard_services) ctx.send_notify(q.from.osd, {q.query.from, q.query.to, q.query.epoch_sent, map_epoch, empty, - PastIntervals{}}); + PastIntervals{}, + PG_FEATURE_CRIMSON_ALL}); } } } diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h index 85de5c711d6..aa6b8a95a94 100644 --- a/src/crimson/osd/osd_operations/peering_event.h +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -44,6 +44,10 @@ protected: float delay = 0; PGPeeringEvent evt; + epoch_t get_epoch_sent_at() const { + return evt.get_epoch_sent(); + } + const pg_shard_t get_from() const { return from; } @@ -84,6 +88,10 @@ public: evt(std::forward<Args>(args)...) 
{} + bool requires_pg() const final { + return evt.requires_pg; + } + void print(std::ostream &) const final; void dump_detail(ceph::Formatter* f) const final; seastar::future<> with_pg( diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h index 43be7319545..21702f6ff4f 100644 --- a/src/crimson/osd/osd_operations/pg_advance_map.h +++ b/src/crimson/osd/osd_operations/pg_advance_map.h @@ -50,6 +50,10 @@ public: PGPeeringPipeline::Process::BlockingEvent > tracking_events; + epoch_t get_epoch_sent_at() const { + return to; + } + private: PGPeeringPipeline &peering_pp(PG &pg); }; diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h index 17c2faf97ea..2fe8ff372b3 100644 --- a/src/crimson/osd/osd_operations/recovery_subrequest.h +++ b/src/crimson/osd/osd_operations/recovery_subrequest.h @@ -39,6 +39,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return m->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return m->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc index 5ca11e5dd15..ec607758c55 100644 --- a/src/crimson/osd/osd_operations/replicated_request.cc +++ b/src/crimson/osd/osd_operations/replicated_request.cc @@ -5,6 +5,7 @@ #include "common/Formatter.h" +#include "crimson/common/coroutine.h" #include "crimson/osd/osd.h" #include "crimson/osd/osd_connection_priv.h" #include "crimson/osd/osd_operation_external_tracking.h" @@ -58,39 +59,57 @@ PerShardPipeline &RepRequest::get_pershard_pipeline( return shard_services.get_replicated_request_pipeline(); } -ClientRequest::PGPipeline &RepRequest::client_pp(PG &pg) +PGRepopPipeline &RepRequest::repop_pipeline(PG &pg) { - return pg.request_pg_pipeline; + return pg.repop_pipeline; +} + +RepRequest::interruptible_future<> RepRequest::with_pg_interruptible( + Ref<PG> pg) +{ + LOG_PREFIX(RepRequest::with_pg_interruptible); + DEBUGI("{}", *this); + co_await this->template enter_stage<interruptor>(repop_pipeline(*pg).process); + co_await interruptor::make_interruptible(this->template with_blocking_event< + PG_OSDMapGate::OSDMapBlocker::BlockingEvent + >([this, pg](auto &&trigger) { + return pg->osdmap_gate.wait_for_map( + std::move(trigger), req->min_epoch); + })); + + if (pg->can_discard_replica_op(*req)) { + co_return; + } + + auto [commit_fut, reply] = co_await pg->handle_rep_op(req); + + // Transitions from OrderedExclusive->OrderedConcurrent cannot block + this->template enter_stage_sync(repop_pipeline(*pg).wait_commit); + + co_await std::move(commit_fut); + + co_await this->template enter_stage<interruptor>( + repop_pipeline(*pg).send_reply); + + co_await interruptor::make_interruptible( + pg->shard_services.send_to_osd( + req->from.osd, std::move(reply), pg->get_osdmap_epoch()) + ); } seastar::future<> RepRequest::with_pg( ShardServices &shard_services, Ref<PG> pg) { LOG_PREFIX(RepRequest::with_pg); - DEBUGI("{}: RepRequest::with_pg", *this); + DEBUGI("{}", *this); IRef ref = this; return interruptor::with_interruption([this, pg] { - LOG_PREFIX(RepRequest::with_pg); - DEBUGI("{}: pg present", *this); - return this->template enter_stage<interruptor>(client_pp(*pg).await_map - ).then_interruptible([this, pg] { - return this->template with_blocking_event< - PG_OSDMapGate::OSDMapBlocker::BlockingEvent - >([this, pg](auto &&trigger) { - 
return pg->osdmap_gate.wait_for_map( - std::move(trigger), req->min_epoch); - }); - }).then_interruptible([this, pg] (auto) { - return pg->handle_rep_op(req); - }).then_interruptible([this] { - logger().debug("{}: complete", *this); - return handle.complete(); - }); + return with_pg_interruptible(pg); }, [](std::exception_ptr) { return seastar::now(); }, pg, pg->get_osdmap_epoch()).finally([this, ref=std::move(ref)] { logger().debug("{}: exit", *this); - handle.exit(); + return handle.complete(); }); } diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h index ff5dea6d6db..c2494b3715f 100644 --- a/src/crimson/osd/osd_operations/replicated_request.h +++ b/src/crimson/osd/osd_operations/replicated_request.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); @@ -68,6 +71,9 @@ public: r_conn = make_local_shared_foreign(std::move(conn)); } + interruptible_future<> with_pg_interruptible( + Ref<PG> pg); + seastar::future<> with_pg( ShardServices &shard_services, Ref<PG> pg); @@ -77,14 +83,16 @@ public: ConnectionPipeline::AwaitMap::BlockingEvent, ConnectionPipeline::GetPGMapping::BlockingEvent, PerShardPipeline::CreateOrWaitPG::BlockingEvent, - ClientRequest::PGPipeline::AwaitMap::BlockingEvent, + PGRepopPipeline::Process::BlockingEvent, + PGRepopPipeline::WaitCommit::BlockingEvent, + PGRepopPipeline::SendReply::BlockingEvent, PG_OSDMapGate::OSDMapBlocker::BlockingEvent, PGMap::PGCreationBlockingEvent, OSD_OSDMapGate::OSDMapBlocker::BlockingEvent > tracking_events; private: - ClientRequest::PGPipeline &client_pp(PG &pg); + PGRepopPipeline &repop_pipeline(PG &pg); crimson::net::ConnectionRef l_conn; crimson::net::ConnectionXcoreRef r_conn; diff --git a/src/crimson/osd/osd_operations/scrub_events.h b/src/crimson/osd/osd_operations/scrub_events.h index 02a5d852bb7..8bed90e4c14 100644 --- a/src/crimson/osd/osd_operations/scrub_events.h +++ b/src/crimson/osd/osd_operations/scrub_events.h @@ -27,11 +27,11 @@ class RemoteScrubEventBaseT : public PhasedOperationT<T> { crimson::net::ConnectionRef l_conn; crimson::net::ConnectionXcoreRef r_conn; - epoch_t epoch; spg_t pgid; protected: using interruptor = InterruptibleOperation::interruptor; + epoch_t epoch; template <typename U=void> using ifut = InterruptibleOperation::interruptible_future<U>; @@ -40,7 +40,7 @@ protected: public: RemoteScrubEventBaseT( crimson::net::ConnectionRef conn, epoch_t epoch, spg_t pgid) - : l_conn(std::move(conn)), epoch(epoch), pgid(pgid) {} + : l_conn(std::move(conn)), pgid(pgid), epoch(epoch) {} PGPeeringPipeline &get_peering_pipeline(PG &pg); @@ -117,6 +117,10 @@ public: : RemoteScrubEventBaseT<ScrubRequested>(std::forward<Args>(base_args)...), deep(deep) {} + epoch_t get_epoch_sent_at() const { + return epoch; + } + void print(std::ostream &out) const final { out << "(deep=" << deep << ")"; } @@ -141,6 +145,10 @@ public: ceph_assert(scrub::PGScrubber::is_scrub_message(*m)); } + epoch_t get_epoch_sent_at() const { + return epoch; + } + void print(std::ostream &out) const final { out << "(m=" << *m << ")"; } diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc index 9ed0b73cfb4..f8fb7aef6f2 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.cc +++ 
b/src/crimson/osd/osd_operations/snaptrim_event.cc @@ -388,58 +388,66 @@ SnapTrimObjSubEvent::remove_or_update( SnapTrimObjSubEvent::snap_trim_obj_subevent_ret_t SnapTrimObjSubEvent::start() { + obc_orderer = pg->obc_loader.get_obc_orderer( + coid); + ceph_assert(pg->is_active_clean()); - auto exit_handle = seastar::defer([this] { - logger().debug("{}: exit", *this); - handle.exit(); + auto exit_handle = seastar::defer([this, opref = IRef(this)] { + logger().debug("{}: exit", *opref); + std::ignore = handle.complete().then([opref = std::move(opref)] {}); }); co_await enter_stage<interruptor>( - client_pp().check_already_complete_get_obc); + obc_orderer->obc_pp().process); logger().debug("{}: getting obc for {}", *this, coid); - // end of commonality - // lock both clone's and head's obcs - co_await pg->obc_loader.with_obc<RWState::RWWRITE>( - coid, - std::bind(&SnapTrimObjSubEvent::process_and_submit, - this, std::placeholders::_1, std::placeholders::_2), - false + + + auto obc_manager = pg->obc_loader.get_obc_manager( + *obc_orderer, + coid, false /* resolve_oid */); + + co_await pg->obc_loader.load_and_lock( + obc_manager, RWState::RWWRITE ).handle_error_interruptible( remove_or_update_iertr::pass_further{}, crimson::ct_error::assert_all{"unexpected error in SnapTrimObjSubEvent"} ); - logger().debug("{}: completed", *this); - co_await interruptor::make_interruptible(handle.complete()); -} - -ObjectContextLoader::load_obc_iertr::future<> -SnapTrimObjSubEvent::process_and_submit(ObjectContextRef head_obc, - ObjectContextRef clone_obc) { - logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid()); - - co_await enter_stage<interruptor>(client_pp().process); + logger().debug("{}: got obc={}", *this, obc_manager.get_obc()->get_oid()); - logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid()); - - auto txn = co_await remove_or_update(clone_obc, head_obc); - - auto [submitted, all_completed] = co_await pg->submit_transaction( - std::move(clone_obc), - std::move(txn), - std::move(osd_op_p), - std::move(log_entries) - ); + auto all_completed = interruptor::now(); + { + // as with PG::submit_executer, we need to build the pg log entries + // and submit the transaction atomically + co_await interruptor::make_interruptible(pg->submit_lock.lock()); + auto unlocker = seastar::defer([this] { + pg->submit_lock.unlock(); + }); - co_await std::move(submitted); + logger().debug("{}: calling remove_or_update obc={}", + *this, obc_manager.get_obc()->get_oid()); + + auto txn = co_await remove_or_update( + obc_manager.get_obc(), obc_manager.get_head_obc()); + + auto submitted = interruptor::now(); + std::tie(submitted, all_completed) = co_await pg->submit_transaction( + ObjectContextRef(obc_manager.get_obc()), + nullptr, + std::move(txn), + std::move(osd_op_p), + std::move(log_entries) + ); + co_await std::move(submitted); + } - co_await enter_stage<interruptor>(client_pp().wait_repop); + co_await enter_stage<interruptor>(obc_orderer->obc_pp().wait_repop); co_await std::move(all_completed); - co_return; + logger().debug("{}: completed", *this); } void SnapTrimObjSubEvent::print(std::ostream &lhs) const diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h index 1164b3169d2..a2b4d357568 100644 --- a/src/crimson/osd/osd_operations/snaptrim_event.h +++ b/src/crimson/osd/osd_operations/snaptrim_event.h @@ -6,6 +6,7 @@ #include <iostream> #include <seastar/core/future.hh> +#include "crimson/osd/object_context_loader.h" #include 
"crimson/osd/osdmap_gate.h" #include "crimson/osd/osd_operation.h" #include "crimson/common/subop_blocker.h" @@ -112,10 +113,6 @@ public: private: object_stat_sum_t delta_stats; - ObjectContextLoader::load_obc_iertr::future<> process_and_submit( - ObjectContextRef head_obc, - ObjectContextRef clone_obc); - snap_trim_obj_subevent_ret_t remove_clone( ObjectContextRef obc, ObjectContextRef head_obc, @@ -158,6 +155,7 @@ private: } Ref<PG> pg; + std::optional<ObjectContextLoader::Orderer> obc_orderer; PipelineHandle handle; osd_op_params_t osd_op_p; const hobject_t coid; @@ -169,9 +167,8 @@ public: std::tuple< StartEvent, - CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent, - CommonPGPipeline::Process::BlockingEvent, - CommonPGPipeline::WaitRepop::BlockingEvent, + CommonOBCPipeline::Process::BlockingEvent, + CommonOBCPipeline::WaitRepop::BlockingEvent, CompletionEvent > tracking_events; }; diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 744a1dbc02b..2746e730f2b 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -132,6 +132,7 @@ PG::PG( pool, name), osdmap, + PG_FEATURE_CRIMSON_ALL, this, this), scrubber(*this), @@ -392,7 +393,13 @@ void PG::on_replica_activate() void PG::on_activate_complete() { - wait_for_active_blocker.unblock(); + /* Confusingly, on_activate_complete is invoked when the primary and replicas + * have recorded the current interval. At that point, the PG may either become + * ACTIVE or PEERED, depending on whether the acting set is eligible for client + * IO. Only unblock wait_for_active_blocker if we actually became ACTIVE */ + if (peering_state.is_active()) { + wait_for_active_blocker.unblock(); + } if (peering_state.needs_recovery()) { logger().info("{}: requesting recovery", @@ -861,43 +868,26 @@ std::ostream& operator<<(std::ostream& os, const PG& pg) return os; } -void PG::mutate_object( - ObjectContextRef& obc, - ceph::os::Transaction& txn, - osd_op_params_t& osd_op_p) +void PG::enqueue_push_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) { - if (obc->obs.exists) { - obc->obs.oi.prior_version = obc->obs.oi.version; - obc->obs.oi.version = osd_op_p.at_version; - if (osd_op_p.user_modify) - obc->obs.oi.user_version = osd_op_p.at_version.version; - obc->obs.oi.last_reqid = osd_op_p.req_id; - obc->obs.oi.mtime = osd_op_p.mtime; - obc->obs.oi.local_mtime = ceph_clock_now(); - - // object_info_t - { - ceph::bufferlist osv; - obc->obs.oi.encode_no_oid(osv, CEPH_FEATURES_ALL); - // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); - txn.setattr(coll_ref->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv); - } + assert(recovery_handler); + assert(recovery_handler->backfill_state); + auto backfill_state = recovery_handler->backfill_state.get(); + backfill_state->enqueue_standalone_push(obj, v, peers); +} - // snapset - if (obc->obs.oi.soid.snap == CEPH_NOSNAP) { - logger().debug("final snapset {} in {}", - obc->ssc->snapset, obc->obs.oi.soid); - ceph::bufferlist bss; - encode(obc->ssc->snapset, bss); - txn.setattr(coll_ref->get_cid(), ghobject_t{obc->obs.oi.soid}, SS_ATTR, bss); - obc->ssc->exists = true; - } else { - logger().debug("no snapset (this is a clone)"); - } - } else { - // reset cached ObjectState without enforcing eviction - obc->obs.oi = object_info_t(obc->obs.oi.soid); - } +void PG::enqueue_delete_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers) +{ + assert(recovery_handler); + 
assert(recovery_handler->backfill_state); + auto backfill_state = recovery_handler->backfill_state.get(); + backfill_state->enqueue_standalone_delete(obj, v, peers); } PG::interruptible_future< @@ -905,6 +895,7 @@ PG::interruptible_future< PG::interruptible_future<>>> PG::submit_transaction( ObjectContextRef&& obc, + ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& osd_op_p, std::vector<pg_log_entry_t>&& log_entries) @@ -917,17 +908,23 @@ PG::submit_transaction( } epoch_t map_epoch = get_osdmap_epoch(); + auto at_version = osd_op_p.at_version; - peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version); + peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, at_version); peering_state.update_trim_to(); ceph_assert(!log_entries.empty()); ceph_assert(log_entries.rbegin()->version >= projected_last_update); projected_last_update = log_entries.rbegin()->version; + for (const auto& entry: log_entries) { + projected_log.add(entry); + } + auto [submitted, all_completed] = co_await backend->submit_transaction( peering_state.get_acting_recovery_backfill(), obc->obs.oi.soid, + std::move(new_clone), std::move(txn), std::move(osd_op_p), peering_state.get_last_peering_reset(), @@ -936,8 +933,8 @@ PG::submit_transaction( co_return std::make_tuple( std::move(submitted), all_completed.then_interruptible( - [this, last_complete=peering_state.get_info().last_complete, - at_version=osd_op_p.at_version](auto acked) { + [this, last_complete=peering_state.get_info().last_complete, at_version] + (auto acked) { for (const auto& peer : acked) { peering_state.update_peer_last_complete_ondisk( peer.shard, peer.last_complete_ondisk); @@ -1014,8 +1011,15 @@ PG::interruptible_future<eversion_t> PG::submit_error_log( const std::error_code e, ceph_tid_t rep_tid) { - logger().debug("{}: {} rep_tid: {} error: {}", - __func__, *m, rep_tid, e); + // as with submit_executer, need to ensure that log numbering and submission + // are atomic + co_await interruptor::make_interruptible(submit_lock.lock()); + auto unlocker = seastar::defer([this] { + submit_lock.unlock(); + }); + LOG_PREFIX(PG::submit_error_log); + DEBUGDPP("{} rep_tid: {} error: {}", + *this, *m, rep_tid, e); const osd_reqid_t &reqid = m->get_reqid(); mempool::osd_pglog::list<pg_log_entry_t> log_entries; log_entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, @@ -1034,49 +1038,47 @@ PG::interruptible_future<eversion_t> PG::submit_error_log( ceph::os::Transaction t; peering_state.merge_new_log_entries( log_entries, t, peering_state.get_pg_trim_to(), - peering_state.get_min_last_complete_ondisk()); - - return seastar::do_with(log_entries, set<pg_shard_t>{}, - [this, t=std::move(t), rep_tid](auto& log_entries, auto& waiting_on) mutable { - return interruptor::do_for_each(get_acting_recovery_backfill(), - [this, log_entries, waiting_on, rep_tid] - (auto& i) mutable { - pg_shard_t peer(i); - if (peer == pg_whoami) { - return seastar::now(); - } - ceph_assert(peering_state.get_peer_missing().count(peer)); - ceph_assert(peering_state.has_peer_info(peer)); - auto log_m = crimson::make_message<MOSDPGUpdateLogMissing>( - log_entries, - spg_t(peering_state.get_info().pgid.pgid, i.shard), - pg_whoami.shard, - get_osdmap_epoch(), - get_last_peering_reset(), - rep_tid, - peering_state.get_pg_trim_to(), - peering_state.get_min_last_complete_ondisk()); - waiting_on.insert(peer); - logger().debug("submit_error_log: sending log" - "missing_request (rep_tid: {} entries: {})" - " to osd {}", rep_tid, log_entries, peer.osd); - 
return shard_services.send_to_osd(peer.osd, - std::move(log_m), - get_osdmap_epoch()); - }).then_interruptible([this, waiting_on, t=std::move(t), rep_tid] () mutable { - waiting_on.insert(pg_whoami); - logger().debug("submit_error_log: inserting rep_tid {}", rep_tid); - log_entry_update_waiting_on.insert( - std::make_pair(rep_tid, - log_update_t{std::move(waiting_on)})); - return shard_services.get_store().do_transaction( - get_collection_ref(), std::move(t) - ).then([this] { - peering_state.update_trim_to(); - return seastar::make_ready_future<eversion_t>(projected_last_update); - }); - }); - }); + peering_state.get_pg_committed_to()); + + + set<pg_shard_t> waiting_on; + for (const auto &peer: get_acting_recovery_backfill()) { + if (peer == pg_whoami) { + continue; + } + ceph_assert(peering_state.get_peer_missing().count(peer)); + ceph_assert(peering_state.has_peer_info(peer)); + auto log_m = crimson::make_message<MOSDPGUpdateLogMissing>( + log_entries, + spg_t(peering_state.get_info().pgid.pgid, peer.shard), + pg_whoami.shard, + get_osdmap_epoch(), + get_last_peering_reset(), + rep_tid, + peering_state.get_pg_trim_to(), + peering_state.get_pg_committed_to()); + waiting_on.insert(peer); + + DEBUGDPP("sending log missing_request (rep_tid: {} entries: {}) to osd {}", + *this, rep_tid, log_entries, peer.osd); + co_await interruptor::make_interruptible( + shard_services.send_to_osd( + peer.osd, + std::move(log_m), + get_osdmap_epoch())); + } + waiting_on.insert(pg_whoami); + DEBUGDPP("inserting rep_tid {}", *this, rep_tid); + log_entry_update_waiting_on.insert( + std::make_pair(rep_tid, + log_update_t{std::move(waiting_on)})); + co_await interruptor::make_interruptible( + shard_services.get_store().do_transaction( + get_collection_ref(), std::move(t) + )); + + peering_state.update_trim_to(); + co_return projected_last_update; } PG::run_executer_fut PG::run_executer( @@ -1132,25 +1134,25 @@ PG::submit_executer_fut PG::submit_executer( OpsExecuter &&ox, const std::vector<OSDOp>& ops) { LOG_PREFIX(PG::submit_executer); - // transaction must commit at this point - return std::move( + DEBUGDPP("", *this); + + // we need to build the pg log entries and submit the transaction + // atomically to ensure log ordering + co_await interruptor::make_interruptible(submit_lock.lock()); + auto unlocker = seastar::defer([this] { + submit_lock.unlock(); + }); + + auto [submitted, completed] = co_await std::move( ox - ).flush_changes_n_do_ops_effects( + ).flush_changes_and_submit( ops, snap_mapper, - osdriver, - [FNAME, this](auto&& txn, - auto&& obc, - auto&& osd_op_p, - auto&& log_entries) { - DEBUGDPP("object {} submitting txn", *this, obc->get_oid()); - mutate_object(obc, txn, osd_op_p); - return submit_transaction( - std::move(obc), - std::move(txn), - std::move(osd_op_p), - std::move(log_entries)); - }); + osdriver + ); + co_return std::make_tuple( + std::move(submitted).then_interruptible([unlocker=std::move(unlocker)] {}), + std::move(completed)); } PG::interruptible_future<MURef<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m) @@ -1215,31 +1217,6 @@ void PG::check_blocklisted_obc_watchers( } } -PG::load_obc_iertr::future<> -PG::with_locked_obc(const hobject_t &hobj, - const OpInfo &op_info, - with_obc_func_t &&f) -{ - if (__builtin_expect(stopping, false)) { - throw crimson::common::system_shutdown_exception(); - } - const hobject_t oid = get_oid(hobj); - auto wrapper = [f=std::move(f), this](auto head, auto obc) { - check_blocklisted_obc_watchers(obc); - return f(head, obc); - }; - switch 
(get_lock_type(op_info)) { - case RWState::RWREAD: - return obc_loader.with_obc<RWState::RWREAD>(oid, std::move(wrapper)); - case RWState::RWWRITE: - return obc_loader.with_obc<RWState::RWWRITE>(oid, std::move(wrapper)); - case RWState::RWEXCL: - return obc_loader.with_obc<RWState::RWEXCL>(oid, std::move(wrapper)); - default: - ceph_abort(); - }; -} - void PG::update_stats(const pg_stat_t &stat) { peering_state.update_stats( [&stat] (auto& history, auto& stats) { @@ -1249,13 +1226,10 @@ void PG::update_stats(const pg_stat_t &stat) { ); } -PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) +PG::handle_rep_op_fut PG::handle_rep_op(Ref<MOSDRepOp> req) { LOG_PREFIX(PG::handle_rep_op); DEBUGDPP("{}", *this, *req); - if (can_discard_replica_op(*req)) { - co_return; - } ceph::os::Transaction txn; auto encoded_txn = req->get_data().cbegin(); @@ -1272,12 +1246,13 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) log_operation(std::move(log_entries), req->pg_trim_to, req->version, - req->min_last_complete_ondisk, + req->pg_committed_to, !txn.empty(), txn, false); DEBUGDPP("{} do_transaction", *this, *req); - co_await interruptor::make_interruptible( + + auto commit_fut = interruptor::make_interruptible( shard_services.get_store().do_transaction(coll_ref, std::move(txn)) ); @@ -1288,10 +1263,7 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req) req.get(), pg_whoami, 0, map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); reply->set_last_complete_ondisk(lcod); - co_await interruptor::make_interruptible( - shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch) - ); - co_return; + co_return handle_rep_op_ret(std::move(commit_fut), std::move(reply)); } PG::interruptible_future<> PG::update_snap_map( @@ -1318,28 +1290,25 @@ void PG::log_operation( std::vector<pg_log_entry_t>&& logv, const eversion_t &trim_to, const eversion_t &roll_forward_to, - const eversion_t &min_last_complete_ondisk, + const eversion_t &pg_committed_to, bool transaction_applied, ObjectStore::Transaction &txn, bool async) { - logger().debug("{}", __func__); + LOG_PREFIX(PG::log_operation); + DEBUGDPP("", *this); if (is_primary()) { - ceph_assert(trim_to <= peering_state.get_last_update_ondisk()); + ceph_assert(trim_to <= peering_state.get_pg_committed_to()); } - /* TODO: when we add snap mapper and projected log support, - * we'll likely want to update them here. - * - * See src/osd/PrimaryLogPG.h:log_operation for how classic - * handles these cases. 
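handle_rep_op() above no longer awaits the commit and sends the ack itself; it returns both to the caller, which can then order the ack after the commit while doing other work in between. A toy version of that contract, with a stand-in Reply type where the real code uses MURef<MOSDRepOpReply>:

    #include <seastar/core/future.hh>
    #include <tuple>
    #include <utility>

    struct Reply {};  // stand-in for MURef<MOSDRepOpReply>

    std::tuple<seastar::future<>, Reply> handle_write() {
      auto commit_fut = seastar::make_ready_future<>();  // do_transaction() above
      return {std::move(commit_fut), Reply{}};
    }

    seastar::future<> caller() {
      auto [commit_fut, reply] = handle_write();
      // Other work may be ordered here while the commit is in flight.
      return std::move(commit_fut).then([reply = std::move(reply)] {
        // send the reply only once the transaction is on disk
      });
    }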
- */ -#if 0 auto last = logv.rbegin(); if (is_primary() && last != logv.rend()) { + DEBUGDPP("on primary, trimming projected log", *this); projected_log.skip_can_rollback_to_to_head(); - projected_log.trim(cct, last->version, nullptr, nullptr, nullptr); + projected_log.trim(shard_services.get_cct(), last->version, + nullptr, nullptr, nullptr); } -#endif + if (!is_primary()) { // && !is_ec_pg() + DEBUGDPP("on replica, clearing obc", *this); replica_clear_repop_obc(logv); } if (!logv.empty()) { @@ -1348,7 +1317,7 @@ void PG::log_operation( peering_state.append_log(std::move(logv), trim_to, roll_forward_to, - min_last_complete_ondisk, + pg_committed_to, txn, !txn.empty(), false); @@ -1356,13 +1325,13 @@ void PG::log_operation( void PG::replica_clear_repop_obc( const std::vector<pg_log_entry_t> &logv) { - logger().debug("{} clearing {} entries", __func__, logv.size()); - for (auto &&e: logv) { - logger().debug(" {} get_object_boundary(from): {} " - " head version(to): {}", - e.soid, - e.soid.get_object_boundary(), - e.soid.get_head()); + LOG_PREFIX(PG::replica_clear_repop_obc); + DEBUGDPP("clearing obc for {} log entries", *this, logv.size()); + for (auto &&e: logv) { + DEBUGDPP("clearing entry for {} from: {} to: {}", + *this, e.soid, + e.soid.get_object_boundary(), + e.soid.get_head()); /* Have to blast all clones, they share a snapset */ obc_registry.clear_range( e.soid.get_object_boundary(), e.soid.get_head()); @@ -1387,17 +1356,17 @@ PG::interruptible_future<> PG::do_update_log_missing( ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING); ObjectStore::Transaction t; - std::optional<eversion_t> op_trim_to, op_roll_forward_to; + std::optional<eversion_t> op_trim_to, op_pg_committed_to; if (m->pg_trim_to != eversion_t()) op_trim_to = m->pg_trim_to; - if (m->pg_roll_forward_to != eversion_t()) - op_roll_forward_to = m->pg_roll_forward_to; - logger().debug("op_trim_to = {}, op_roll_forward_to = {}", + if (m->pg_committed_to != eversion_t()) + op_pg_committed_to = m->pg_committed_to; + logger().debug("op_trim_to = {}, op_pg_committed_to = {}", op_trim_to.has_value() ? *op_trim_to : eversion_t(), - op_roll_forward_to.has_value() ? *op_roll_forward_to : eversion_t()); + op_pg_committed_to.has_value() ? *op_pg_committed_to : eversion_t()); peering_state.append_log_entries_update_missing( - m->entries, t, op_trim_to, op_roll_forward_to); + m->entries, t, op_trim_to, op_pg_committed_to); return interruptor::make_interruptible(shard_services.get_store().do_transaction( coll_ref, std::move(t))).then_interruptible( @@ -1615,14 +1584,21 @@ bool PG::should_send_op( return true; bool should_send = (hoid.pool != (int64_t)get_info().pgid.pool() || - (has_backfill_state() && hoid <= get_last_backfill_started()) || - hoid <= peering_state.get_peer_info(peer).last_backfill); + // An object has been fully pushed to the backfill target if and only if + // either of the following conditions is met: + // 1. peer_info.last_backfill has passed "hoid" + // 2. last_backfill_started has passed "hoid" and "hoid" is not in the peer + // missing set + hoid <= peering_state.get_peer_info(peer).last_backfill || + (has_backfill_state() && hoid <= get_last_backfill_started() && + !is_missing_on_peer(peer, hoid))); if (!should_send) { ceph_assert(is_backfill_target(peer)); logger().debug("{} issue_repop shipping empty opt to osd." 
"{}, object {} beyond std::max(last_backfill_started, " "peer_info[peer].last_backfill {})", - peer, hoid, peering_state.get_peer_info(peer).last_backfill); + __func__, peer, hoid, + peering_state.get_peer_info(peer).last_backfill); } return should_send; // TODO: should consider async recovery cases in the future which are not supported @@ -1637,8 +1613,8 @@ PG::already_complete(const osd_reqid_t& reqid) int ret; std::vector<pg_log_op_return_item_t> op_returns; - if (peering_state.get_pg_log().get_log().get_request( - reqid, &version, &user_version, &ret, &op_returns)) { + if (check_in_progress_op( + reqid, &version, &user_version, &ret, &op_returns)) { complete_op_t dupinfo{ user_version, version, @@ -1703,4 +1679,19 @@ void PG::C_PG_FinishRecovery::finish(int r) { DEBUGDPP("stale recovery finsher", pg); } } +bool PG::check_in_progress_op( + const osd_reqid_t& reqid, + eversion_t *version, + version_t *user_version, + int *return_code, + std::vector<pg_log_op_return_item_t> *op_returns + ) const +{ + return ( + projected_log.get_request(reqid, version, user_version, return_code, + op_returns) || + peering_state.get_pg_log().get_log().get_request( + reqid, version, user_version, return_code, op_returns)); +} + } diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 604f49005ff..06038c0aa00 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -10,6 +10,7 @@ #include <seastar/core/shared_future.hh> #include "common/dout.h" +#include "common/ostream_temp.h" #include "include/interval_set.h" #include "crimson/net/Fwd.h" #include "messages/MOSDRepOpReply.h" @@ -45,6 +46,7 @@ class MQuery; class OSDMap; class PGBackend; +class ReplicatedBackend; class PGPeeringEvent; class osd_op_params_t; @@ -76,7 +78,8 @@ class PG : public boost::intrusive_ref_counter< using ec_profile_t = std::map<std::string,std::string>; using cached_map_t = OSDMapService::cached_map_t; - ClientRequest::PGPipeline request_pg_pipeline; + CommonPGPipeline request_pg_pipeline; + PGRepopPipeline repop_pipeline; PGPeeringPipeline peering_request_pg_pipeline; ClientRequest::Orderer client_request_orderer; @@ -129,8 +132,8 @@ public: return peering_state.get_pg_trim_to(); } - eversion_t get_min_last_complete_ondisk() const { - return peering_state.get_min_last_complete_ondisk(); + eversion_t get_pg_committed_to() const { + return peering_state.get_pg_committed_to(); } const pg_info_t& get_info() const final { @@ -376,6 +379,7 @@ public: void check_blocklisted_watchers() final; void clear_primary_state() final { recovery_finisher = nullptr; + projected_log = PGLog::IndexedLog(); } void queue_check_readable(epoch_t last_peering_reset, @@ -517,6 +521,9 @@ public: // Utility + bool is_active() const { + return peering_state.is_active(); + } bool is_active_clean() const { return peering_state.is_active() && peering_state.is_clean(); } @@ -589,12 +596,13 @@ public: using with_obc_func_t = std::function<load_obc_iertr::future<> (ObjectContextRef, ObjectContextRef)>; - load_obc_iertr::future<> with_locked_obc( - const hobject_t &hobj, - const OpInfo &op_info, - with_obc_func_t&& f); - - interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m); + using handle_rep_op_ret = std::tuple< + interruptible_future<>, // resolves upon commit + MURef<MOSDRepOpReply> // reply message + >; + // outer future resolves upon submission + using handle_rep_op_fut = interruptible_future<handle_rep_op_ret>; + handle_rep_op_fut handle_rep_op(Ref<MOSDRepOp> m); void update_stats(const pg_stat_t &stat); interruptible_future<> update_snap_map( 
const std::vector<pg_log_entry_t> &log_entries, @@ -603,7 +611,7 @@ public: std::vector<pg_log_entry_t>&& logv, const eversion_t &trim_to, const eversion_t &roll_forward_to, - const eversion_t &min_last_complete_ondisk, + const eversion_t &pg_committed_to, bool transaction_applied, ObjectStore::Transaction &txn, bool async = false); @@ -663,6 +671,7 @@ private: const OpInfo &op_info, std::vector<OSDOp>& ops); + seastar::shared_mutex submit_lock; using submit_executer_ret = std::tuple< interruptible_future<>, interruptible_future<>>; @@ -675,13 +684,18 @@ private: struct do_osd_ops_params_t; interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m); + +public: interruptible_future< std::tuple<interruptible_future<>, interruptible_future<>>> submit_transaction( ObjectContextRef&& obc, + ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& oop, std::vector<pg_log_entry_t>&& log_entries); + +private: interruptible_future<> repair_object( const hobject_t& oid, eversion_t& v); @@ -826,8 +840,15 @@ public: const eversion_t version; const int err; }; + PGLog::IndexedLog projected_log; interruptible_future<std::optional<complete_op_t>> already_complete(const osd_reqid_t& reqid); + bool check_in_progress_op( + const osd_reqid_t& reqid, + eversion_t *version, + version_t *user_version, + int *return_code, + std::vector<pg_log_op_return_item_t> *op_returns) const; int get_recovery_op_priority() const { int64_t pri = 0; get_pgpool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri); @@ -879,15 +900,20 @@ private: friend class SnapTrimObjSubEvent; private: - void mutate_object( - ObjectContextRef& obc, - ceph::os::Transaction& txn, - osd_op_params_t& osd_op_p); + void enqueue_push_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); + void enqueue_delete_for_backfill( + const hobject_t &obj, + const eversion_t &v, + const std::vector<pg_shard_t> &peers); + bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const; bool can_discard_op(const MOSDOp& m) const; void context_registry_on_change(); bool is_missing_object(const hobject_t& soid) const { - return peering_state.get_pg_log().get_missing().get_items().count(soid); + return get_local_missing().is_missing(soid); } bool is_unreadable_object(const hobject_t &oid, eversion_t* v = 0) const final { @@ -895,6 +921,11 @@ private: !peering_state.get_missing_loc().readable_with_acting( oid, get_actingset(), v); } + bool is_missing_on_peer( + const pg_shard_t &peer, + const hobject_t &soid) const { + return peering_state.get_peer_missing(peer).is_missing(soid); + } bool is_degraded_or_backfilling_object(const hobject_t& soid) const; const std::set<pg_shard_t> &get_actingset() const { return peering_state.get_actingset(); @@ -902,6 +933,7 @@ private: private: friend class IOInterruptCondition; + friend class ::ReplicatedBackend; struct log_update_t { std::set<pg_shard_t> waiting_on; seastar::shared_promise<> all_committed; diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc index 24a381b4cf7..79895de06de 100644 --- a/src/crimson/osd/pg_backend.cc +++ b/src/crimson/osd/pg_backend.cc @@ -1283,22 +1283,6 @@ PGBackend::rm_xattr( return rm_xattr_iertr::now(); } -void PGBackend::clone( - /* const */object_info_t& snap_oi, - const ObjectState& os, - const ObjectState& d_os, - ceph::os::Transaction& txn) -{ - // See OpsExecuter::execute_clone documentation - txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid}); - { - 
ceph::bufferlist bv; - snap_oi.encode_no_oid(bv, CEPH_FEATURES_ALL); - txn.setattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, OI_ATTR, bv); - } - txn.rmattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, SS_ATTR); -} - using get_omap_ertr = crimson::os::FuturizedStore::Shard::read_errorator::extend< crimson::ct_error::enodata>; @@ -1341,9 +1325,10 @@ maybe_get_omap_vals( PGBackend::ll_read_ierrorator::future<ceph::bufferlist> PGBackend::omap_get_header( const crimson::os::CollectionRef& c, - const ghobject_t& oid) const + const ghobject_t& oid, + uint32_t op_flags) const { - return store->omap_get_header(c, oid) + return store->omap_get_header(c, oid, op_flags) .handle_error( crimson::ct_error::enodata::handle([] { return seastar::make_ready_future<bufferlist>(); @@ -1356,10 +1341,13 @@ PGBackend::ll_read_ierrorator::future<> PGBackend::omap_get_header( const ObjectState& os, OSDOp& osd_op, - object_stat_sum_t& delta_stats) const + object_stat_sum_t& delta_stats, + uint32_t op_flags) const { if (os.oi.is_omap()) { - return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then_interruptible( + return omap_get_header( + coll, ghobject_t{os.oi.soid}, CEPH_OSD_OP_FLAG_FADVISE_DONTNEED + ).safe_then_interruptible( [&delta_stats, &osd_op] (ceph::bufferlist&& header) { osd_op.outdata = std::move(header); delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); @@ -1723,7 +1711,8 @@ PGBackend::fiemap( CollectionRef c, const ghobject_t& oid, uint64_t off, - uint64_t len) + uint64_t len, + uint32_t op_flags) { return store->fiemap(c, oid, off, len); } @@ -1835,3 +1824,32 @@ PGBackend::read_ierrorator::future<> PGBackend::tmapget( read_errorator::pass_further{}); } +void PGBackend::set_metadata( + const hobject_t &obj, + object_info_t &oi, + const SnapSet *ss /* non-null iff head */, + ceph::os::Transaction& txn) +{ + ceph_assert((obj.is_head() && ss) || (!obj.is_head() && !ss)); + { + ceph::bufferlist bv; + oi.encode_no_oid(bv, CEPH_FEATURES_ALL); + txn.setattr(coll->get_cid(), ghobject_t{obj}, OI_ATTR, bv); + } + if (ss) { + ceph::bufferlist bss; + encode(*ss, bss); + txn.setattr(coll->get_cid(), ghobject_t{obj}, SS_ATTR, bss); + } +} + +void PGBackend::clone_for_write( + const hobject_t &from, + const hobject_t &to, + ceph::os::Transaction &txn) +{ + // See OpsExecuter::execute_clone documentation + txn.clone(coll->get_cid(), ghobject_t{from}, ghobject_t{to}); + txn.rmattr(coll->get_cid(), ghobject_t{to}, SS_ATTR); +} + diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h index fa1f1405ffe..9c2230375b0 100644 --- a/src/crimson/osd/pg_backend.h +++ b/src/crimson/osd/pg_backend.h @@ -308,11 +308,6 @@ public: ObjectState& os, const OSDOp& osd_op, ceph::os::Transaction& trans); - void clone( - /* const */object_info_t& snap_oi, - const ObjectState& os, - const ObjectState& d_os, - ceph::os::Transaction& trans); interruptible_future<struct stat> stat( CollectionRef c, const ghobject_t& oid) const; @@ -320,7 +315,8 @@ public: CollectionRef c, const ghobject_t& oid, uint64_t off, - uint64_t len); + uint64_t len, + uint32_t op_flags = 0); write_iertr::future<> tmapput( ObjectState& os, @@ -380,11 +376,13 @@ public: object_stat_sum_t& delta_stats); ll_read_ierrorator::future<ceph::bufferlist> omap_get_header( const crimson::os::CollectionRef& c, - const ghobject_t& oid) const; + const ghobject_t& oid, + uint32_t op_flags = 0) const; ll_read_ierrorator::future<> omap_get_header( const ObjectState& os, OSDOp& osd_op, - object_stat_sum_t& delta_stats) const; + object_stat_sum_t& 
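The removed PGBackend::clone() wrote object data and metadata in one shot; the new clone_for_write()/set_metadata() pair shown above splits those steps so the caller can create the clone and settle its metadata separately. A toy model of the split (string attrs stand in for the OI_ATTR/SS_ATTR blobs the real code writes via ceph::os::Transaction):

    #include <map>
    #include <string>
    #include <utility>

    using Attrs = std::map<std::string, std::string>;

    struct ToyStore {
      // object name -> (data, attrs)
      std::map<std::string, std::pair<std::string, Attrs>> objects;

      void clone_for_write(const std::string &from, const std::string &to) {
        objects[to] = objects[from];     // clone the object wholesale
        objects[to].second.erase("SS");  // clones never carry a snapset attr
      }

      void set_metadata(const std::string &obj, const std::string &oi,
                        const std::string *ss /* non-null iff head */) {
        objects[obj].second["OI"] = oi;     // object info: head and clones
        if (ss) {
          objects[obj].second["SS"] = *ss;  // snapset lives only on the head
        }
      }
    };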
delta_stats, + uint32_t op_flags = 0) const; interruptible_future<> omap_set_header( ObjectState& os, const OSDOp& osd_op, @@ -411,9 +409,24 @@ public: ceph::os::Transaction& trans, osd_op_params_t& osd_op_params, object_stat_sum_t& delta_stats); + + /// sets oi and (for head) ss attrs + void set_metadata( + const hobject_t &obj, + object_info_t &oi, + const SnapSet *ss /* non-null iff head */, + ceph::os::Transaction& trans); + + /// clone from->to and clear ss attribute on to + void clone_for_write( + const hobject_t &from, + const hobject_t &to, + ceph::os::Transaction& trans); + virtual rep_op_fut_t submit_transaction(const std::set<pg_shard_t> &pg_shards, const hobject_t& hoid, + crimson::osd::ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& osd_op_p, epoch_t min_epoch, epoch_t max_epoch, diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index ec3af0d2b00..5eef584c776 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -67,8 +67,6 @@ PGRecovery::start_recovery_ops( if (max_to_start > 0) { max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started); } - using interruptor = - crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>; return interruptor::parallel_for_each(started, [] (auto&& ifut) { return std::move(ifut); @@ -609,8 +607,21 @@ void PGRecovery::update_peers_last_backfill( bool PGRecovery::budget_available() const { - // TODO: the limits! - return true; + crimson::osd::scheduler::params_t params = + {1, 0, crimson::osd::scheduler::scheduler_class_t::background_best_effort}; + auto &ss = pg->get_shard_services(); + auto futopt = ss.try_acquire_throttle_now(std::move(params)); + if (!futopt) { + return true; + } + std::ignore = interruptor::make_interruptible(std::move(*futopt) + ).then_interruptible([this] { + assert(!backfill_state->is_triggered()); + using BackfillState = crimson::osd::BackfillState; + backfill_state->process_event( + BackfillState::ThrottleAcquired{}.intrusive_from_this()); + }); + return false; } void PGRecovery::on_pg_clean() diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h index 705b3176b97..5c7b5c5ef2b 100644 --- a/src/crimson/osd/pg_recovery.h +++ b/src/crimson/osd/pg_recovery.h @@ -25,6 +25,8 @@ class PGBackend; class PGRecovery : public crimson::osd::BackfillState::BackfillListener { public: + using interruptor = + crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>; template <typename T = void> using interruptible_future = RecoveryBackend::interruptible_future<T>; PGRecovery(PGRecoveryListener* pg) : pg(pg) {} @@ -45,6 +47,10 @@ public: seastar::future<> stop() { return seastar::now(); } void on_pg_clean(); + void enqueue_push( + const hobject_t& obj, + const eversion_t& v, + const std::vector<pg_shard_t> &peers) final; private: PGRecoveryListener* pg; size_t start_primary_recovery_ops( @@ -108,10 +114,6 @@ private: const hobject_t& end) final; void request_primary_scan( const hobject_t& begin) final; - void enqueue_push( - const hobject_t& obj, - const eversion_t& v, - const std::vector<pg_shard_t> &peers) final; void enqueue_drop( const pg_shard_t& target, const hobject_t& obj, diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h index b9879c8c9dd..f7bd7a6c08e 100644 --- a/src/crimson/osd/pg_shard_manager.h +++ b/src/crimson/osd/pg_shard_manager.h @@ -256,18 +256,40 @@ public: auto &opref = *op; return opref.template with_blocking_event< 
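budget_available() above now tries to take throttle budget synchronously and, when it cannot, parks a wake-up instead of stalling recovery inline. The shape of that pattern, distilled (the empty lambda stands in for posting BackfillState::ThrottleAcquired under the PG's interruptor):

    #include <optional>
    #include <tuple>
    #include <seastar/core/future.hh>

    bool budget_available_toy(std::optional<seastar::future<>> maybe_wait) {
      if (!maybe_wait) {
        return true;  // throttle acquired synchronously, proceed now
      }
      // Budget exhausted: arrange to re-drive the state machine when the
      // throttle frees up, and report "not available" to the caller.
      std::ignore = std::move(*maybe_wait).then([] {
        // process_event(ThrottleAcquired{}) in the real code
      });
      return false;
    }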
PGMap::PGCreationBlockingEvent - >([&target_shard_services, &opref](auto &&trigger) { - return target_shard_services.wait_for_pg( - std::move(trigger), opref.get_pgid()); - }).safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) { - logger.debug("{}: have_pg", opref); - return opref.with_pg(target_shard_services, pgref); - }).handle_error( - crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { - logger.debug("{}: pg creation canceled, dropping", opref); - return seastar::now(); - }) - ).then([op=std::move(op)] {}); + >([&target_shard_services, &opref, &logger](auto &&trigger) mutable { + auto pg = target_shard_services.get_pg(opref.get_pgid()); + auto fut = ShardServices::wait_for_pg_ertr::make_ready_future<Ref<PG>>(pg); + if (!pg) { + if (opref.requires_pg()) { + auto osdmap = target_shard_services.get_map(); + if (!osdmap->is_up_acting_osd_shard( + opref.get_pgid(), target_shard_services.local_state.whoami)) { + logger.debug( + "pg {} for {} is no longer here, discarding", + opref.get_pgid(), opref); + opref.get_handle().exit(); + auto _fut = seastar::now(); + if (osdmap->get_epoch() > opref.get_epoch_sent_at()) { + _fut = target_shard_services.send_incremental_map( + std::ref(opref.get_foreign_connection()), + opref.get_epoch_sent_at() + 1); + } + return _fut; + } + } + fut = target_shard_services.wait_for_pg( + std::move(trigger), opref.get_pgid()); + } + return fut.safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) { + logger.debug("{}: have_pg", opref); + return opref.with_pg(target_shard_services, pgref); + }).handle_error( + crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { + logger.debug("{}: pg creation canceled, dropping", opref); + return seastar::now(); + }) + ); + }).then([op=std::move(op)] {}); } seastar::future<> load_pgs(crimson::os::FuturizedStore& store); diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc index cbb8c883e07..6c8abecffaf 100644 --- a/src/crimson/osd/replicated_backend.cc +++ b/src/crimson/osd/replicated_backend.cc @@ -36,19 +36,59 @@ ReplicatedBackend::_read(const hobject_t& hoid, return store->read(coll, ghobject_t{hoid}, off, len, flags); } +MURef<MOSDRepOp> ReplicatedBackend::new_repop_msg( + const pg_shard_t &pg_shard, + const hobject_t &hoid, + const bufferlist &encoded_txn, + const osd_op_params_t &osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + const std::vector<pg_log_entry_t> &log_entries, + bool send_op, + ceph_tid_t tid) +{ + ceph_assert(pg_shard != whoami); + auto m = crimson::make_message<MOSDRepOp>( + osd_op_p.req_id, + whoami, + spg_t{pgid, pg_shard.shard}, + hoid, + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, + map_epoch, + min_epoch, + tid, + osd_op_p.at_version); + if (send_op) { + m->set_data(encoded_txn); + } else { + ceph::os::Transaction t; + bufferlist bl; + encode(t, bl); + m->set_data(bl); + } + encode(log_entries, m->logbl); + m->pg_trim_to = osd_op_p.pg_trim_to; + m->pg_committed_to = osd_op_p.pg_committed_to; + m->pg_stats = pg.get_info().stats; + return m; +} + ReplicatedBackend::rep_op_fut_t -ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, - const hobject_t& hoid, - ceph::os::Transaction&& t, - osd_op_params_t&& opp, - epoch_t min_epoch, epoch_t map_epoch, - std::vector<pg_log_entry_t>&& logv) +ReplicatedBackend::submit_transaction( + const std::set<pg_shard_t> &pg_shards, + const hobject_t& hoid, + crimson::osd::ObjectContextRef &&new_clone, + ceph::os::Transaction&& t, + osd_op_params_t&& opp, + 
epoch_t min_epoch, epoch_t map_epoch, + std::vector<pg_log_entry_t>&& logv) { LOG_PREFIX(ReplicatedBackend::submit_transaction); DEBUGDPP("object {}", dpp, hoid); auto log_entries = std::move(logv); auto txn = std::move(t); auto osd_op_p = std::move(opp); + auto _new_clone = std::move(new_clone); const ceph_tid_t tid = shard_services.get_tid(); auto pending_txn = @@ -56,50 +96,57 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, bufferlist encoded_txn; encode(txn, encoded_txn); + bool is_delete = false; for (auto &le : log_entries) { le.mark_unrollbackable(); + if (le.is_delete()) { + is_delete = true; + } } + co_await pg.update_snap_map(log_entries, txn); + + std::vector<pg_shard_t> to_push_clone; + std::vector<pg_shard_t> to_push_delete; auto sends = std::make_unique<std::vector<seastar::future<>>>(); - for (auto pg_shard : pg_shards) { - if (pg_shard != whoami) { - auto m = crimson::make_message<MOSDRepOp>( - osd_op_p.req_id, - whoami, - spg_t{pgid, pg_shard.shard}, - hoid, - CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK, - map_epoch, - min_epoch, - tid, - osd_op_p.at_version); - if (pg.should_send_op(pg_shard, hoid)) { - m->set_data(encoded_txn); - } else { - ceph::os::Transaction t; - bufferlist bl; - encode(t, bl); - m->set_data(bl); + for (auto &pg_shard : pg_shards) { + if (pg_shard == whoami) { + continue; + } + MURef<MOSDRepOp> m; + if (pg.should_send_op(pg_shard, hoid)) { + m = new_repop_msg( + pg_shard, hoid, encoded_txn, osd_op_p, + min_epoch, map_epoch, log_entries, true, tid); + } else { + m = new_repop_msg( + pg_shard, hoid, encoded_txn, osd_op_p, + min_epoch, map_epoch, log_entries, false, tid); + if (pg.is_missing_on_peer(pg_shard, hoid)) { + if (_new_clone) { + // The head is in the push queue but hasn't been pushed yet. + // We need to ensure that the newly created clone will be + // pushed as well, otherwise we might skip it. + // See: https://tracker.ceph.com/issues/68808 + to_push_clone.push_back(pg_shard); + } + if (is_delete) { + to_push_delete.push_back(pg_shard); + } } - pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); - encode(log_entries, m->logbl); - m->pg_trim_to = osd_op_p.pg_trim_to; - m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk; - m->pg_stats = pg.get_info().stats; - // TODO: set more stuff. e.g., pg_states - sends->emplace_back( - shard_services.send_to_osd( - pg_shard.osd, std::move(m), map_epoch)); } + pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); + // TODO: set more stuff. 
e.g., pg_states + sends->emplace_back( + shard_services.send_to_osd( + pg_shard.osd, std::move(m), map_epoch)); } - co_await pg.update_snap_map(log_entries, txn); - pg.log_operation( std::move(log_entries), osd_op_p.pg_trim_to, osd_op_p.at_version, - osd_op_p.min_last_complete_ondisk, + osd_op_p.pg_committed_to, true, txn, false); @@ -120,9 +167,20 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards, return seastar::now(); } return peers->all_committed.get_shared_future(); - }).then_interruptible([pending_txn, this] { + }).then_interruptible([pending_txn, this, _new_clone, &hoid, + to_push_delete=std::move(to_push_delete), + to_push_clone=std::move(to_push_clone)] { auto acked_peers = std::move(pending_txn->second.acked_peers); pending_trans.erase(pending_txn); + if (_new_clone && !to_push_clone.empty()) { + pg.enqueue_push_for_backfill( + _new_clone->obs.oi.soid, + _new_clone->obs.oi.version, + to_push_clone); + } + if (!to_push_delete.empty()) { + pg.enqueue_delete_for_backfill(hoid, {}, to_push_delete); + } return seastar::make_ready_future< crimson::osd::acked_peers_t>(std::move(acked_peers)); }); diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h index fb8704d8742..d5844b23a0c 100644 --- a/src/crimson/osd/replicated_backend.h +++ b/src/crimson/osd/replicated_backend.h @@ -35,6 +35,7 @@ private: rep_op_fut_t submit_transaction( const std::set<pg_shard_t> &pg_shards, const hobject_t& hoid, + crimson::osd::ObjectContextRef&& new_clone, ceph::os::Transaction&& txn, osd_op_params_t&& osd_op_p, epoch_t min_epoch, epoch_t max_epoch, @@ -60,6 +61,17 @@ private: pending_transactions_t pending_trans; crimson::osd::PG& pg; + MURef<MOSDRepOp> new_repop_msg( + const pg_shard_t &pg_shard, + const hobject_t &hoid, + const bufferlist &encoded_txn, + const osd_op_params_t &osd_op_p, + epoch_t min_epoch, + epoch_t map_epoch, + const std::vector<pg_log_entry_t> &log_entries, + bool send_op, + ceph_tid_t tid); + seastar::future<> request_committed( const osd_reqid_t& reqid, const eversion_t& at_version) final; }; diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc index 76f24196b51..0d6c9d38236 100644 --- a/src/crimson/osd/replicated_recovery_backend.cc +++ b/src/crimson/osd/replicated_recovery_backend.cc @@ -35,6 +35,15 @@ ReplicatedRecoveryBackend::recover_object( logger().debug("recover_object: loading obc: {}", soid); return pg.obc_loader.with_obc<RWState::RWREAD>(soid, [this, soid, need](auto head, auto obc) { + if (!obc->obs.exists) { + // XXX: this recovery must be triggered by backfills and the corresponding + // object must have been deleted by some client request after the object + // is enqueued for push but before the lock is acquired by the recovery. + // + // Abort the recovery in this case, a "recover_delete" must have been + // added for this object by the client request that deleted it. 
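+      // With the object gone there is nothing to push; returning a
+      // ready future turns this push into a no-op, and the deletion
+      // itself still reaches the backfill targets via the queued
+      // "recover_delete".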
+ return interruptor::now(); + } logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid); auto& recovery_waiter = get_recovering(soid); recovery_waiter.obc = obc; @@ -306,7 +315,10 @@ ReplicatedRecoveryBackend::recover_delete( } return seastar::make_ready_future<>(); }).then_interruptible([this, soid, &stat_diff] { - pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + const auto &missing = pg.get_peering_state().get_pg_log().get_missing(); + if (!missing.is_missing(soid)) { + pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true); + } return seastar::make_ready_future<>(); }); }); @@ -568,14 +580,17 @@ ReplicatedRecoveryBackend::read_metadata_for_push_op( return seastar::make_ready_future<eversion_t>(ver); } return interruptor::make_interruptible(interruptor::when_all_succeed( - backend->omap_get_header(coll, ghobject_t(oid)).handle_error_interruptible<false>( + backend->omap_get_header( + coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED + ).handle_error_interruptible<false>( crimson::os::FuturizedStore::Shard::read_errorator::all_same_way( [oid] (const std::error_code& e) { logger().debug("read_metadata_for_push_op, error {} when getting omap header: {}", e, oid); return seastar::make_ready_future<bufferlist>(); })), - interruptor::make_interruptible(store->get_attrs(coll, ghobject_t(oid))) - .handle_error_interruptible<false>( + interruptor::make_interruptible( + store->get_attrs(coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) + ).handle_error_interruptible<false>( crimson::os::FuturizedStore::Shard::get_attrs_ertr::all_same_way( [oid] (const std::error_code& e) { logger().debug("read_metadata_for_push_op, error {} when getting attrs: {}", e, oid); @@ -613,8 +628,14 @@ ReplicatedRecoveryBackend::read_object_for_push_op( return seastar::make_ready_future<uint64_t>(offset); } // 1. get the extents in the interested range - return interruptor::make_interruptible(backend->fiemap(coll, ghobject_t{oid}, - 0, copy_subset.range_end())).safe_then_interruptible( + return interruptor::make_interruptible( + backend->fiemap( + coll, + ghobject_t{oid}, + 0, + copy_subset.range_end(), + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) + ).safe_then_interruptible( [=, this](auto&& fiemap_included) mutable { interval_set<uint64_t> extents; try { @@ -630,8 +651,12 @@ ReplicatedRecoveryBackend::read_object_for_push_op( push_op->data_included.span_of(extents, offset, max_len); // 3. 
read the truncated extents // TODO: check if the returned extents are pruned - return interruptor::make_interruptible(store->readv(coll, ghobject_t{oid}, - push_op->data_included, 0)); + return interruptor::make_interruptible( + store->readv( + coll, + ghobject_t{oid}, + push_op->data_included, + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)); }).safe_then_interruptible([push_op, range_end=copy_subset.range_end()](auto &&bl) { push_op->data.claim_append(std::move(bl)); uint64_t recovered_to = 0; diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index a053d9d5044..e1acb34636f 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -783,6 +783,11 @@ seastar::future<> ShardServices::dispatch_context_transaction( co_return; } +Ref<PG> ShardServices::get_pg(spg_t pgid) +{ + return local_state.get_pg(pgid); +} + seastar::future<> ShardServices::dispatch_context_messages( BufferedRecoveryMessages &&ctx) { @@ -802,15 +807,19 @@ seastar::future<> ShardServices::dispatch_context_messages( seastar::future<> ShardServices::dispatch_context( crimson::os::CollectionRef col, - PeeringCtx &&ctx) -{ - ceph_assert(col || ctx.transaction.empty()); - return seastar::when_all_succeed( - dispatch_context_messages( - BufferedRecoveryMessages{ctx}), - col ? dispatch_context_transaction(col, ctx) : seastar::now() - ).then_unpack([] { - return seastar::now(); + PeeringCtx &&pctx) +{ + return seastar::do_with( + std::move(pctx), + [this, col](auto &ctx) { + ceph_assert(col || ctx.transaction.empty()); + return seastar::when_all_succeed( + dispatch_context_messages( + BufferedRecoveryMessages{ctx}), + col ? dispatch_context_transaction(col, ctx) : seastar::now() + ).then_unpack([] { + return seastar::now(); + }); }); } diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h index fb86418aba2..f1ed9b8d911 100644 --- a/src/crimson/osd/shard_services.h +++ b/src/crimson/osd/shard_services.h @@ -10,6 +10,7 @@ #include "include/common_fwd.h" #include "osd_operation.h" +#include "osd/osd_types_fmt.h" #include "msg/MessageRef.h" #include "crimson/common/exception.h" #include "crimson/common/shared_lru.h" @@ -482,6 +483,8 @@ public: return pg_to_shard_mapping.remove_pg_mapping(pgid); } + Ref<PG> get_pg(spg_t pgid); + crimson::common::CephContext *get_cct() { return &(local_state.cct); } @@ -588,6 +591,7 @@ public: FORWARD_TO_OSD_SINGLETON(get_pool_info) FORWARD(with_throttle_while, with_throttle_while, local_state.throttler) + FORWARD(try_acquire_throttle_now, try_acquire_throttle_now, local_state.throttler) FORWARD_TO_OSD_SINGLETON(build_incremental_map_msg) FORWARD_TO_OSD_SINGLETON(send_incremental_map) diff --git a/src/crimson/tools/perf_crimson_msgr.cc b/src/crimson/tools/perf_crimson_msgr.cc index e5f56361fff..5623438f821 100644 --- a/src/crimson/tools/perf_crimson_msgr.cc +++ b/src/crimson/tools/perf_crimson_msgr.cc @@ -1,6 +1,7 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab +#include <iomanip> #include <map> #include <boost/program_options.hpp> #include <boost/iterator/counting_iterator.hpp> diff --git a/src/crimson/tools/store_nbd/tm_driver.cc b/src/crimson/tools/store_nbd/tm_driver.cc index 7af0d996caa..870809c5153 100644 --- a/src/crimson/tools/store_nbd/tm_driver.cc +++ b/src/crimson/tools/store_nbd/tm_driver.cc @@ -25,6 +25,7 @@ seastar::future<> TMDriver::write( return tm->with_transaction_intr( Transaction::src_t::MUTATE, "write", + CACHE_HINT_TOUCH, [this, offset, &ptr](auto& 
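The dispatch_context() change above is the standard Seastar lifetime idiom: seastar::do_with() takes ownership of the moved-in PeeringCtx and keeps it alive until the whole future chain resolves, which the old code did not guarantee once the rvalue parameter went out of scope. A minimal sketch:

    #include <seastar/core/do_with.hh>
    #include <seastar/core/future.hh>
    #include <string>

    // do_with() owns the moved-in value and guarantees it stays alive
    // until the future returned by the lambda resolves.
    seastar::future<> consume(std::string s) {
      return seastar::do_with(std::move(s), [](std::string &owned) {
        // safe to reference `owned` anywhere in this chain
        return seastar::make_ready_future<>();
      });
    }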
t) { return tm->remove(t, laddr_t::from_byte_offset(offset) @@ -82,11 +83,14 @@ TMDriver::read_extents_ret TMDriver::read_extents( return tm->read_pin<TestBlock>( t, std::move(pin) - ).si_then([&ret](auto ref) mutable { - ret.push_back(std::make_pair(ref->get_laddr(), ref)); + ).si_then([&ret](auto maybe_indirect_extent) mutable { + assert(!maybe_indirect_extent.is_indirect()); + assert(!maybe_indirect_extent.is_clone); + auto& e = maybe_indirect_extent.extent; + ret.push_back(std::make_pair(e->get_laddr(), e)); logger().debug( "read_extents: got extent {}", - *ref); + *e); return seastar::now(); }); }).si_then([&ret] { @@ -109,6 +113,7 @@ seastar::future<bufferlist> TMDriver::read( return tm->with_transaction_intr( Transaction::src_t::READ, "read", + CACHE_HINT_TOUCH, [=, &blret, this](auto& t) { return read_extents(t, laddr_t::from_byte_offset(offset), size |