summaryrefslogtreecommitdiffstats
path: root/src/crimson/os
diff options
context:
space:
mode:
Diffstat (limited to 'src/crimson/os')
-rw-r--r--src/crimson/os/alienstore/alien_store.cc57
-rw-r--r--src/crimson/os/alienstore/alien_store.h36
-rw-r--r--src/crimson/os/alienstore/thread_pool.cc3
-rw-r--r--src/crimson/os/cyanstore/cyan_store.cc34
-rw-r--r--src/crimson/os/cyanstore/cyan_store.h29
-rw-r--r--src/crimson/os/futurized_store.h28
-rw-r--r--src/crimson/os/seastore/CMakeLists.txt3
-rw-r--r--src/crimson/os/seastore/async_cleaner.cc47
-rw-r--r--src/crimson/os/seastore/async_cleaner.h15
-rw-r--r--src/crimson/os/seastore/backref/btree_backref_manager.cc14
-rw-r--r--src/crimson/os/seastore/backref/btree_backref_manager.h30
-rw-r--r--src/crimson/os/seastore/backref_entry.h127
-rw-r--r--src/crimson/os/seastore/backref_manager.h1
-rw-r--r--src/crimson/os/seastore/backref_mapping.h27
-rw-r--r--src/crimson/os/seastore/btree/btree_range_pin.cc54
-rw-r--r--src/crimson/os/seastore/btree/btree_range_pin.h29
-rw-r--r--src/crimson/os/seastore/btree/fixed_kv_btree.h10
-rw-r--r--src/crimson/os/seastore/btree/fixed_kv_node.h46
-rw-r--r--src/crimson/os/seastore/cache.cc372
-rw-r--r--src/crimson/os/seastore/cache.h619
-rw-r--r--src/crimson/os/seastore/cached_extent.cc210
-rw-r--r--src/crimson/os/seastore/cached_extent.h368
-rw-r--r--src/crimson/os/seastore/collection_manager/collection_flat_node.h2
-rw-r--r--src/crimson/os/seastore/collection_manager/flat_collection_manager.cc7
-rw-r--r--src/crimson/os/seastore/device.cc2
-rw-r--r--src/crimson/os/seastore/extent_placement_manager.cc4
-rw-r--r--src/crimson/os/seastore/extent_placement_manager.h24
-rw-r--r--src/crimson/os/seastore/extentmap_manager.cc33
-rw-r--r--src/crimson/os/seastore/journal.h15
-rw-r--r--src/crimson/os/seastore/journal/circular_bounded_journal.cc53
-rw-r--r--src/crimson/os/seastore/journal/circular_bounded_journal.h13
-rw-r--r--src/crimson/os/seastore/journal/record_submitter.cc4
-rw-r--r--src/crimson/os/seastore/journal/segment_allocator.cc4
-rw-r--r--src/crimson/os/seastore/journal/segmented_journal.cc37
-rw-r--r--src/crimson/os/seastore/journal/segmented_journal.h15
-rw-r--r--src/crimson/os/seastore/lba_manager.h1
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc53
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h16
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h2
-rw-r--r--src/crimson/os/seastore/lba_mapping.cc44
-rw-r--r--src/crimson/os/seastore/lba_mapping.h73
-rw-r--r--src/crimson/os/seastore/object_data_handler.cc357
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h3
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc25
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h40
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h18
-rw-r--r--src/crimson/os/seastore/onode.cc2
-rw-r--r--src/crimson/os/seastore/onode.h4
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h10
-rw-r--r--src/crimson/os/seastore/random_block_manager/block_rb_manager.cc6
-rw-r--r--src/crimson/os/seastore/record_scanner.cc16
-rw-r--r--src/crimson/os/seastore/root_block.h2
-rw-r--r--src/crimson/os/seastore/root_meta.h76
-rw-r--r--src/crimson/os/seastore/seastore.cc60
-rw-r--r--src/crimson/os/seastore/seastore.h40
-rw-r--r--src/crimson/os/seastore/seastore_types.cc36
-rw-r--r--src/crimson/os/seastore/seastore_types.h146
-rw-r--r--src/crimson/os/seastore/segment_manager.cc12
-rw-r--r--src/crimson/os/seastore/segment_manager/block.cc58
-rw-r--r--src/crimson/os/seastore/segment_manager/ephemeral.cc15
-rw-r--r--src/crimson/os/seastore/segment_manager/zbd.cc43
-rw-r--r--src/crimson/os/seastore/segment_manager_group.cc10
-rw-r--r--src/crimson/os/seastore/transaction.h91
-rw-r--r--src/crimson/os/seastore/transaction_interruptor.cc15
-rw-r--r--src/crimson/os/seastore/transaction_interruptor.h77
-rw-r--r--src/crimson/os/seastore/transaction_manager.cc215
-rw-r--r--src/crimson/os/seastore/transaction_manager.h306
67 files changed, 2747 insertions, 1497 deletions
diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc
index 3fd2bb1fd15..db6decd84f9 100644
--- a/src/crimson/os/alienstore/alien_store.cc
+++ b/src/crimson/os/alienstore/alien_store.cc
@@ -141,7 +141,8 @@ seastar::future<> AlienStore::stop()
AlienStore::base_errorator::future<bool>
AlienStore::exists(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
return op_gates.simple_dispatch("exists", [=, this] {
return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] {
@@ -212,7 +213,8 @@ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
AlienStore::list_objects(CollectionRef ch,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const
+ uint64_t limit,
+ uint32_t op_flags) const
{
logger().debug("{}", __func__);
assert(tp);
@@ -348,7 +350,8 @@ AlienStore::readv(CollectionRef ch,
AlienStore::get_attr_errorator::future<ceph::bufferlist>
AlienStore::get_attr(CollectionRef ch,
const ghobject_t& oid,
- std::string_view name) const
+ std::string_view name,
+ uint32_t op_flags) const
{
logger().debug("{}", __func__);
assert(tp);
@@ -376,7 +379,8 @@ AlienStore::get_attr(CollectionRef ch,
AlienStore::get_attrs_ertr::future<AlienStore::attrs_t>
AlienStore::get_attrs(CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
logger().debug("{}", __func__);
assert(tp);
@@ -397,7 +401,8 @@ AlienStore::get_attrs(CollectionRef ch,
auto AlienStore::omap_get_values(CollectionRef ch,
const ghobject_t& oid,
- const set<string>& keys)
+ const set<string>& keys,
+ uint32_t op_flags)
-> read_errorator::future<omap_values_t>
{
logger().debug("{}", __func__);
@@ -421,7 +426,8 @@ auto AlienStore::omap_get_values(CollectionRef ch,
auto AlienStore::omap_get_values(CollectionRef ch,
const ghobject_t &oid,
- const std::optional<string> &start)
+ const std::optional<string> &start,
+ uint32_t op_flags)
-> read_errorator::future<std::tuple<bool, omap_values_t>>
{
logger().debug("{} with_start", __func__);
@@ -429,8 +435,21 @@ auto AlienStore::omap_get_values(CollectionRef ch,
return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) {
return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] {
auto c = static_cast<AlienCollection*>(ch.get());
- return store->omap_get_values(c->collection, oid, start,
- reinterpret_cast<map<string, bufferlist>*>(&values));
+ return store->omap_iterate(
+ c->collection, oid,
+ ObjectStore::omap_iter_seek_t{
+ .seek_position = start.value_or(std::string{}),
+          // FIXME: classical OSDs begin iteration from LOWER_BOUND
+          // (or UPPER_BOUND if filter_prefix > start). However, these
+          // bits are not implemented yet
+ .seek_type = ObjectStore::omap_iter_seek_t::UPPER_BOUND
+ },
+ [&values]
+ (std::string_view key, std::string_view value) mutable {
+ values[std::string{key}].append(value);
+          // FIXME: there is no limit on the number of entries yet
+ return ObjectStore::omap_iter_ret_t::NEXT;
+ });
}).then([&values] (int r)
-> read_errorator::future<std::tuple<bool, omap_values_t>> {
if (r == -ENOENT) {
@@ -578,7 +597,8 @@ unsigned AlienStore::get_max_attr_name_length() const
seastar::future<struct stat> AlienStore::stat(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
assert(tp);
return do_with_op_gate((struct stat){}, [this, ch, oid](auto& st) {
@@ -590,8 +610,22 @@ seastar::future<struct stat> AlienStore::stat(
});
}
+seastar::future<std::string> AlienStore::get_default_device_class()
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return op_gates.simple_dispatch("get_default_device_class", [=, this] {
+ return tp->submit([=, this] {
+ return store->get_default_device_class();
+ }).then([] (std::string device_class) {
+ return seastar::make_ready_future<std::string>(device_class);
+ });
+ });
+}
+
auto AlienStore::omap_get_header(CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
-> get_attr_errorator::future<ceph::bufferlist>
{
assert(tp);
@@ -617,7 +651,8 @@ AlienStore::read_errorator::future<std::map<uint64_t, uint64_t>> AlienStore::fie
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
assert(tp);
return do_with_op_gate(std::map<uint64_t, uint64_t>(), [=, this](auto& destmap) {
diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h
index d36f449afd8..1d39411450e 100644
--- a/src/crimson/os/alienstore/alien_store.h
+++ b/src/crimson/os/alienstore/alien_store.h
@@ -36,7 +36,8 @@ public:
base_errorator::future<bool> exists(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final;
read_errorator::future<ceph::bufferlist> read(CollectionRef c,
const ghobject_t& oid,
@@ -49,29 +50,36 @@ public:
uint32_t op_flags = 0) final;
- get_attr_errorator::future<ceph::bufferlist> get_attr(CollectionRef c,
- const ghobject_t& oid,
- std::string_view name) const final;
- get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c,
- const ghobject_t& oid) final;
+ get_attr_errorator::future<ceph::bufferlist> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name,
+ uint32_t op_flags = 0) const final;
+ get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) final;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) final;
/// Retrieves paged set of values > start (if present)
read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) final; ///< @return <done, values> values.empty() iff done
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const final;
+ uint64_t limit,
+ uint32_t op_flags = 0) const final;
seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
@@ -97,15 +105,19 @@ public:
unsigned get_max_attr_name_length() const final;
seastar::future<struct stat> stat(
CollectionRef,
- const ghobject_t&) final;
+ const ghobject_t&,
+ uint32_t op_flags = 0) final;
+ seastar::future<std::string> get_default_device_class() final;
get_attr_errorator::future<ceph::bufferlist> omap_get_header(
CollectionRef,
- const ghobject_t&) final;
+ const ghobject_t&,
+ uint32_t) final;
read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
CollectionRef,
const ghobject_t&,
uint64_t off,
- uint64_t len) final;
+ uint64_t len,
+ uint32_t op_flags) final;
FuturizedStore::Shard& get_sharded_store() final {
return *this;
diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc
index 5cf9590e61e..2d208548b32 100644
--- a/src/crimson/os/alienstore/thread_pool.cc
+++ b/src/crimson/os/alienstore/thread_pool.cc
@@ -7,6 +7,7 @@
#include <pthread.h>
#include "include/ceph_assert.h"
+#include "include/intarith.h" // for round_up_to()
#include "crimson/common/config_proxy.h"
using crimson::common::local_conf;
@@ -27,7 +28,7 @@ ThreadPool::ThreadPool(size_t n_threads,
pin(*cpus);
}
block_sighup();
- (void) pthread_setname_np(pthread_self(), "alien-store-tp");
+ (void) ceph_pthread_setname("alien-store-tp");
loop(queue_max_wait, i);
});
}
diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc
index 7b945e5aa15..41819fb5eb6 100644
--- a/src/crimson/os/cyanstore/cyan_store.cc
+++ b/src/crimson/os/cyanstore/cyan_store.cc
@@ -12,6 +12,7 @@
#include "crimson/common/buffer_io.h"
#include "crimson/common/config_proxy.h"
+#include "crimson/common/perf_counters_collection.h"
#include "cyan_collection.h"
#include "cyan_object.h"
@@ -143,6 +144,12 @@ CyanStore::list_collections()
});
}
+seastar::future<std::string>
+CyanStore::get_default_device_class()
+{
+ return seastar::make_ready_future<std::string>("");
+}
+
CyanStore::mount_ertr::future<> CyanStore::Shard::mount()
{
static const char read_file_errmsg[]{"read_file"};
@@ -201,7 +208,8 @@ CyanStore::Shard::list_objects(
CollectionRef ch,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const
+ uint64_t limit,
+ uint32_t op_flags) const
{
auto c = static_cast<Collection*>(ch.get());
logger().debug("{} {} {} {} {}",
@@ -250,7 +258,8 @@ CyanStore::Shard::list_collections()
CyanStore::Shard::base_errorator::future<bool>
CyanStore::Shard::exists(
CollectionRef ch,
- const ghobject_t &oid)
+ const ghobject_t &oid,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
if (!c->exists) {
@@ -326,7 +335,8 @@ CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist>
CyanStore::Shard::get_attr(
CollectionRef ch,
const ghobject_t& oid,
- std::string_view name) const
+ std::string_view name,
+ uint32_t op_flags) const
{
auto c = static_cast<Collection*>(ch.get());
logger().debug("{} {} {}",
@@ -345,7 +355,8 @@ CyanStore::Shard::get_attr(
CyanStore::Shard::get_attrs_ertr::future<CyanStore::Shard::attrs_t>
CyanStore::Shard::get_attrs(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
logger().debug("{} {} {}",
@@ -360,7 +371,8 @@ CyanStore::Shard::get_attrs(
auto CyanStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t& oid,
- const omap_keys_t& keys)
+ const omap_keys_t& keys,
+ uint32_t op_flags)
-> read_errorator::future<omap_values_t>
{
auto c = static_cast<Collection*>(ch.get());
@@ -381,7 +393,8 @@ auto CyanStore::Shard::omap_get_values(
auto CyanStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const std::optional<string> &start)
+ const std::optional<string> &start,
+ uint32_t op_flags)
-> CyanStore::Shard::read_errorator::future<std::tuple<bool, omap_values_t>>
{
auto c = static_cast<Collection*>(ch.get());
@@ -402,7 +415,8 @@ auto CyanStore::Shard::omap_get_values(
auto CyanStore::Shard::omap_get_header(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
-> CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist>
{
auto c = static_cast<Collection*>(ch.get());
@@ -970,7 +984,8 @@ CyanStore::Shard::fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
@@ -985,7 +1000,8 @@ CyanStore::Shard::fiemap(
seastar::future<struct stat>
CyanStore::Shard::stat(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
auto o = c->get_object(oid);
diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h
index 99583d07d36..1d481ef5829 100644
--- a/src/crimson/os/cyanstore/cyan_store.h
+++ b/src/crimson/os/cyanstore/cyan_store.h
@@ -34,11 +34,13 @@ public:
seastar::future<struct stat> stat(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
base_errorator::future<bool> exists(
CollectionRef ch,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<ceph::bufferlist> read(
CollectionRef c,
@@ -56,33 +58,39 @@ public:
get_attr_errorator::future<ceph::bufferlist> get_attr(
CollectionRef c,
const ghobject_t& oid,
- std::string_view name) const final;
+ std::string_view name,
+ uint32_t op_flags = 0) const final;
get_attrs_ertr::future<attrs_t> get_attrs(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) final;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) final;
read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) final;
get_attr_errorator::future<ceph::bufferlist> omap_get_header(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
list_objects(
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const final;
+ uint64_t limit,
+ uint32_t op_flags = 0) const final;
seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
@@ -101,7 +109,8 @@ public:
CollectionRef c,
const ghobject_t& oid,
uint64_t off,
- uint64_t len) final;
+ uint64_t len,
+ uint32_t op_flags) final;
unsigned get_max_attr_name_length() const final;
@@ -221,6 +230,8 @@ public:
seastar::future<std::vector<coll_core_t>> list_collections() final;
+ seastar::future<std::string> get_default_device_class() final;
+
private:
seastar::sharded<CyanStore::Shard> shard_stores;
const std::string path;
diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h
index 0dca695ba3a..e7d4c8546de 100644
--- a/src/crimson/os/futurized_store.h
+++ b/src/crimson/os/futurized_store.h
@@ -54,7 +54,8 @@ public:
virtual base_errorator::future<bool> exists(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
using get_attr_errorator = crimson::errorator<
crimson::ct_error::enoent,
@@ -62,42 +63,49 @@ public:
virtual get_attr_errorator::future<ceph::bufferlist> get_attr(
CollectionRef c,
const ghobject_t& oid,
- std::string_view name) const = 0;
+ std::string_view name,
+ uint32_t op_flags = 0) const = 0;
using get_attrs_ertr = crimson::errorator<
crimson::ct_error::enoent>;
using attrs_t = std::map<std::string, ceph::bufferlist, std::less<>>;
virtual get_attrs_ertr::future<attrs_t> get_attrs(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
virtual seastar::future<struct stat> stat(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
using omap_values_t = attrs_t;
using omap_keys_t = std::set<std::string>;
virtual read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) = 0;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) = 0;
using omap_values_paged_t = std::tuple<bool, omap_values_t>;
virtual read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) = 0; ///< @return <done, values> values.empty() only if done
virtual get_attr_errorator::future<bufferlist> omap_get_header(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const = 0;
+ uint64_t limit,
+ uint32_t op_flags = 0) const = 0;
virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0;
@@ -153,7 +161,8 @@ public:
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len) = 0;
+ uint64_t len,
+ uint32_t op_flags = 0) = 0;
virtual unsigned get_max_attr_name_length() const = 0;
};
@@ -203,6 +212,7 @@ public:
using coll_core_t = std::pair<coll_t, core_id_t>;
virtual seastar::future<std::vector<coll_core_t>> list_collections() = 0;
+ virtual seastar::future<std::string> get_default_device_class() = 0;
protected:
const core_id_t primary_core;
};
diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt
index 4bdbab8c4e5..3da5e65ceec 100644
--- a/src/crimson/os/seastore/CMakeLists.txt
+++ b/src/crimson/os/seastore/CMakeLists.txt
@@ -1,9 +1,11 @@
set(crimson_seastore_srcs
cached_extent.cc
+ lba_mapping.cc
seastore_types.cc
segment_manager.cc
segment_manager/ephemeral.cc
segment_manager/block.cc
+ transaction_interruptor.cc
transaction_manager.cc
transaction.cc
cache.cc
@@ -18,7 +20,6 @@ set(crimson_seastore_srcs
omap_manager.cc
omap_manager/btree/btree_omap_manager.cc
omap_manager/btree/omap_btree_node_impl.cc
- btree/btree_range_pin.cc
btree/fixed_kv_node.cc
onode.cc
onode_manager/staged-fltree/node.cc
diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc
index 5046980eae5..64e6749562e 100644
--- a/src/crimson/os/seastore/async_cleaner.cc
+++ b/src/crimson/os/seastore/async_cleaner.cc
@@ -131,7 +131,7 @@ void segments_info_t::add_segment_manager(
auto ssize = segment_manager.get_segment_size();
auto nsegments = segment_manager.get_num_segments();
auto sm_size = segment_manager.get_available_size();
- INFO("adding segment manager {}, size={}, ssize={}, segments={}",
+ INFO("adding segment manager {}, size=0x{:x}, segment size=0x{:x}, segments={}",
device_id_printer_t{d_id}, sm_size, ssize, nsegments);
ceph_assert(ssize > 0);
ceph_assert(nsegments > 0);
@@ -329,9 +329,9 @@ std::ostream &operator<<(std::ostream &os, const segments_info_t &infos)
<< ", closed=" << infos.get_num_closed()
<< ", type_journal=" << infos.get_num_type_journal()
<< ", type_ool=" << infos.get_num_type_ool()
- << ", total=" << infos.get_total_bytes() << "B"
- << ", available=" << infos.get_available_bytes() << "B"
- << ", unavailable=" << infos.get_unavailable_bytes() << "B"
+ << ", total=0x" << std::hex << infos.get_total_bytes() << "B"
+ << ", available=0x" << infos.get_available_bytes() << "B"
+ << ", unavailable=0x" << infos.get_unavailable_bytes() << "B" << std::dec
<< ", available_ratio=" << infos.get_available_ratio()
<< ", submitted_head=" << infos.get_submitted_journal_head()
<< ", time_bound=" << sea_time_point_printer_t{infos.get_time_bound()}
@@ -609,6 +609,7 @@ JournalTrimmerImpl::trim_alloc()
return extent_callback->with_transaction_intr(
Transaction::src_t::TRIM_ALLOC,
"trim_alloc",
+ CACHE_HINT_NOCACHE,
[this, FNAME](auto &t)
{
auto target = get_alloc_tail_target();
@@ -653,6 +654,7 @@ JournalTrimmerImpl::trim_dirty()
return extent_callback->with_transaction_intr(
Transaction::src_t::TRIM_DIRTY,
"trim_dirty",
+ CACHE_HINT_NOCACHE,
[this, FNAME](auto &t)
{
auto target = get_dirty_tail_target();
@@ -765,10 +767,10 @@ int64_t SpaceTrackerDetailed::SegmentMap::allocate(
for (auto i = b; i < e; ++i) {
if (bitmap[i]) {
if (!error) {
- ERROR("found allocated in {}, {} ~ {}", segment, offset, len);
+ ERROR("found allocated in {}, 0x{:x}~0x{:x}", segment, offset, len);
error = true;
}
- DEBUG("block {} allocated", i * block_size);
+ DEBUG("block 0x{:x}B allocated", i * block_size);
}
bitmap[i] = true;
}
@@ -792,10 +794,10 @@ int64_t SpaceTrackerDetailed::SegmentMap::release(
for (auto i = b; i < e; ++i) {
if (!bitmap[i]) {
if (!error) {
- ERROR("found unallocated in {}, {} ~ {}", segment, offset, len);
+ ERROR("found unallocated in {}, 0x{:x}~0x{:x}", segment, offset, len);
error = true;
}
- DEBUG("block {} unallocated", i * block_size);
+ DEBUG("block 0x{:x}B unallocated", i * block_size);
}
bitmap[i] = false;
}
@@ -831,7 +833,7 @@ void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const
INFO("dump start");
for (unsigned i = 0; i < bitmap.size(); ++i) {
if (bitmap[i]) {
- LOCAL_LOGGER.info(" {} still live", i * block_size);
+ LOCAL_LOGGER.info(" 0x{:x}B still live", i * block_size);
}
}
}
@@ -847,7 +849,7 @@ void SpaceTrackerDetailed::dump_usage(segment_id_t id) const
void SpaceTrackerSimple::dump_usage(segment_id_t id) const
{
LOG_PREFIX(SpaceTrackerSimple::dump_usage);
- INFO("id: {}, live_bytes: {}",
+ INFO("id: {}, live_bytes: 0x{:x}",
id, live_bytes_by_segment[id].live_bytes);
}
@@ -1125,6 +1127,7 @@ SegmentCleaner::do_reclaim_space(
return extent_callback->with_transaction_intr(
src,
"clean_reclaim_space",
+ CACHE_HINT_NOCACHE,
[this, &backref_extents, &pin_list, &reclaimed](auto &t)
{
return seastar::do_with(
@@ -1142,8 +1145,7 @@ SegmentCleaner::do_reclaim_space(
pin->get_key(),
pin->get_val(),
pin->get_length(),
- pin->get_type(),
- JOURNAL_SEQ_NULL);
+ pin->get_type());
}
for (auto &cached_backref : cached_backref_entries) {
if (cached_backref.laddr == L_ADDR_NULL) {
@@ -1165,7 +1167,7 @@ SegmentCleaner::do_reclaim_space(
[this, &extents, &t](auto &ent)
{
LOG_PREFIX(SegmentCleaner::do_reclaim_space);
- TRACET("getting extent of type {} at {}~{}",
+ TRACET("getting extent of type {} at {}~0x{:x}",
t,
ent.type,
ent.paddr,
@@ -1241,6 +1243,7 @@ SegmentCleaner::clean_space_ret SegmentCleaner::clean_space()
return extent_callback->with_transaction_intr(
Transaction::src_t::READ,
"retrieve_from_backref_tree",
+ CACHE_HINT_NOCACHE,
[this, &weak_read_ret](auto &t) {
return backref_manager.get_mappings(
t,
@@ -1507,6 +1510,7 @@ bool SegmentCleaner::check_usage()
SpaceTrackerIRef tracker(space_tracker->make_empty());
extent_callback->with_transaction_weak(
"check_usage",
+ CACHE_HINT_NOCACHE,
[this, &tracker](auto &t) {
return backref_manager.scan_mapped_space(
t,
@@ -1568,7 +1572,7 @@ void SegmentCleaner::mark_space_used(
background_callback->maybe_wake_background();
assert(ret > 0);
- DEBUG("segment {} new len: {}~{}, live_bytes: {}",
+ DEBUG("segment {} new len: {}~0x{:x}, live_bytes: 0x{:x}",
seg_addr.get_segment_id(),
addr,
len,
@@ -1591,7 +1595,7 @@ void SegmentCleaner::mark_space_free(
stats.used_bytes -= len;
auto& seg_addr = addr.as_seg_paddr();
- DEBUG("segment {} free len: {}~{}",
+ DEBUG("segment {} free len: {}~0x{:x}",
seg_addr.get_segment_id(), addr, len);
auto old_usage = calc_utilization(seg_addr.get_segment_id());
[[maybe_unused]] auto ret = space_tracker->release(
@@ -1602,7 +1606,7 @@ void SegmentCleaner::mark_space_free(
adjust_segment_util(old_usage, new_usage);
background_callback->maybe_wake_blocked_io();
assert(ret >= 0);
- DEBUG("segment {} free len: {}~{}, live_bytes: {}",
+ DEBUG("segment {} free len: {}~0x{:x}, live_bytes: 0x{:x}",
seg_addr.get_segment_id(),
addr,
len,
@@ -1687,11 +1691,11 @@ void SegmentCleaner::print(std::ostream &os, bool is_detailed) const
<< ", reclaim_ratio=" << get_reclaim_ratio()
<< ", alive_ratio=" << get_alive_ratio();
if (is_detailed) {
- os << ", unavailable_unreclaimable="
+ os << ", unavailable_unreclaimable=0x" << std::hex
<< get_unavailable_unreclaimable_bytes() << "B"
- << ", unavailable_reclaimble="
+ << ", unavailable_reclaimble=0x"
<< get_unavailable_reclaimable_bytes() << "B"
- << ", alive=" << stats.used_bytes << "B"
+ << ", alive=0x" << stats.used_bytes << "B" << std::dec
<< ", " << segments;
}
os << ")";
@@ -1722,7 +1726,7 @@ void RBMCleaner::mark_space_used(
for (auto rbm : rbms) {
if (addr.get_device_id() == rbm->get_device_id()) {
if (rbm->get_start() <= addr) {
- DEBUG("allocate addr: {} len: {}", addr, len);
+ DEBUG("allocate addr: {} len: 0x{:x}", addr, len);
stats.used_bytes += len;
rbm->mark_space_used(addr, len);
}
@@ -1741,7 +1745,7 @@ void RBMCleaner::mark_space_free(
for (auto rbm : rbms) {
if (addr.get_device_id() == rbm->get_device_id()) {
if (rbm->get_start() <= addr) {
- DEBUG("free addr: {} len: {}", addr, len);
+ DEBUG("free addr: {} len: 0x{:x}", addr, len);
ceph_assert(stats.used_bytes >= len);
stats.used_bytes -= len;
rbm->mark_space_free(addr, len);
@@ -1813,6 +1817,7 @@ bool RBMCleaner::check_usage()
RBMSpaceTracker tracker(rbms);
extent_callback->with_transaction_weak(
"check_usage",
+ CACHE_HINT_NOCACHE,
[this, &tracker, &rbms](auto &t) {
return backref_manager.scan_mapped_space(
t,
diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h
index 424247c5bdc..1cef771aeb8 100644
--- a/src/crimson/os/seastore/async_cleaner.h
+++ b/src/crimson/os/seastore/async_cleaner.h
@@ -17,6 +17,7 @@
#include "crimson/os/seastore/randomblock_manager_group.h"
#include "crimson/os/seastore/transaction.h"
#include "crimson/os/seastore/segment_seq_allocator.h"
+#include "crimson/os/seastore/backref_mapping.h"
namespace crimson::os::seastore {
@@ -299,24 +300,29 @@ public:
/// Creates empty transaction
/// weak transaction should be type READ
virtual TransactionRef create_transaction(
- Transaction::src_t, const char *name, bool is_weak=false) = 0;
+ Transaction::src_t,
+ const char *name,
+ cache_hint_t cache_hint = CACHE_HINT_TOUCH,
+ bool is_weak=false) = 0;
/// Creates empty transaction with interruptible context
template <typename Func>
auto with_transaction_intr(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint,
Func &&f) {
return do_with_transaction_intr<Func, false>(
- src, name, std::forward<Func>(f));
+ src, name, cache_hint, std::forward<Func>(f));
}
template <typename Func>
auto with_transaction_weak(
const char* name,
+ cache_hint_t cache_hint,
Func &&f) {
return do_with_transaction_intr<Func, true>(
- Transaction::src_t::READ, name, std::forward<Func>(f)
+ Transaction::src_t::READ, name, cache_hint, std::forward<Func>(f)
).handle_error(
crimson::ct_error::eagain::assert_failure{"unexpected eagain"},
crimson::ct_error::pass_further_all{}
@@ -385,9 +391,10 @@ private:
auto do_with_transaction_intr(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint,
Func &&f) {
return seastar::do_with(
- create_transaction(src, name, IsWeak),
+ create_transaction(src, name, cache_hint, IsWeak),
[f=std::forward<Func>(f)](auto &ref_t) mutable {
return with_trans_intr(
*ref_t,
diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc
index f89698d602a..9cbf65f4033 100644
--- a/src/crimson/os/seastore/backref/btree_backref_manager.cc
+++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc
@@ -28,28 +28,22 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node<
ceph_assert(backref_root->is_initial_pending()
== root_block->is_pending());
return {true,
- trans_intr::make_interruptible(
- c.cache.get_extent_viewable_by_trans(c.trans, backref_root))};
+ c.cache.get_extent_viewable_by_trans(c.trans, backref_root)};
} else if (root_block->is_pending()) {
auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance());
backref_root = prior.backref_root_node;
if (backref_root) {
return {true,
- trans_intr::make_interruptible(
- c.cache.get_extent_viewable_by_trans(c.trans, backref_root))};
+ c.cache.get_extent_viewable_by_trans(c.trans, backref_root)};
} else {
c.cache.account_absent_access(c.trans.get_src());
return {false,
- trans_intr::make_interruptible(
- Cache::get_extent_ertr::make_ready_future<
- CachedExtentRef>())};
+ Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()};
}
} else {
c.cache.account_absent_access(c.trans.get_src());
return {false,
- trans_intr::make_interruptible(
- Cache::get_extent_ertr::make_ready_future<
- CachedExtentRef>())};
+ Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()};
}
}
diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h
index 38084bb00e6..24897dd55da 100644
--- a/src/crimson/os/seastore/backref/btree_backref_manager.h
+++ b/src/crimson/os/seastore/backref/btree_backref_manager.h
@@ -9,44 +9,28 @@
namespace crimson::os::seastore::backref {
-constexpr size_t BACKREF_BLOCK_SIZE = 4096;
-
-class BtreeBackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> {
- extent_types_t type;
+class BtreeBackrefMapping : public BackrefMapping {
public:
BtreeBackrefMapping(op_context_t<paddr_t> ctx)
- : BtreeNodeMapping(ctx) {}
+ : BackrefMapping(ctx) {}
BtreeBackrefMapping(
op_context_t<paddr_t> ctx,
CachedExtentRef parent,
uint16_t pos,
backref_map_val_t &val,
backref_node_meta_t &&meta)
- : BtreeNodeMapping(
+ : BackrefMapping(
+ val.type,
ctx,
parent,
pos,
val.laddr,
val.len,
- std::forward<backref_node_meta_t>(meta)),
- type(val.type)
- {}
- extent_types_t get_type() const final {
- return type;
- }
-
- bool is_clone() const final {
- return false;
- }
-
-protected:
- std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>> _duplicate(
- op_context_t<paddr_t> ctx) const final {
- return std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>>(
- new BtreeBackrefMapping(ctx));
- }
+ std::forward<backref_node_meta_t>(meta)) {}
};
+constexpr size_t BACKREF_BLOCK_SIZE = 4096;
+
using BackrefBtree = FixedKVBtree<
paddr_t, backref_map_val_t, BackrefInternalNode,
BackrefLeafNode, BtreeBackrefMapping, BACKREF_BLOCK_SIZE, false>;
diff --git a/src/crimson/os/seastore/backref_entry.h b/src/crimson/os/seastore/backref_entry.h
new file mode 100644
index 00000000000..5f9becc9565
--- /dev/null
+++ b/src/crimson/os/seastore/backref_entry.h
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <iostream>
+
+#if FMT_VERSION >= 90000
+#include <fmt/ostream.h>
+#endif
+
+#include <boost/intrusive/set.hpp>
+
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+
+struct backref_entry_t {
+ using ref_t = std::unique_ptr<backref_entry_t>;
+
+ backref_entry_t(
+ const paddr_t& paddr,
+ const laddr_t& laddr,
+ extent_len_t len,
+ extent_types_t type)
+ : paddr(paddr),
+ laddr(laddr),
+ len(len),
+ type(type) {
+ assert(len > 0);
+ }
+ paddr_t paddr = P_ADDR_NULL;
+ laddr_t laddr = L_ADDR_NULL;
+ extent_len_t len = 0;
+ extent_types_t type = extent_types_t::NONE;
+ friend bool operator< (
+ const backref_entry_t &l,
+ const backref_entry_t &r) {
+ return l.paddr < r.paddr;
+ }
+ friend bool operator> (
+ const backref_entry_t &l,
+ const backref_entry_t &r) {
+ return l.paddr > r.paddr;
+ }
+ friend bool operator== (
+ const backref_entry_t &l,
+ const backref_entry_t &r) {
+ return l.paddr == r.paddr;
+ }
+
+ using set_hook_t =
+ boost::intrusive::set_member_hook<
+ boost::intrusive::link_mode<
+ boost::intrusive::auto_unlink>>;
+ set_hook_t backref_set_hook;
+ using backref_set_member_options = boost::intrusive::member_hook<
+ backref_entry_t,
+ set_hook_t,
+ &backref_entry_t::backref_set_hook>;
+ using multiset_t = boost::intrusive::multiset<
+ backref_entry_t,
+ backref_set_member_options,
+ boost::intrusive::constant_time_size<false>>;
+
+ struct cmp_t {
+ using is_transparent = paddr_t;
+ bool operator()(
+ const backref_entry_t &l,
+ const backref_entry_t &r) const {
+ return l.paddr < r.paddr;
+ }
+ bool operator()(const paddr_t l, const backref_entry_t &r) const {
+ return l < r.paddr;
+ }
+ bool operator()(const backref_entry_t &l, const paddr_t r) const {
+ return l.paddr < r;
+ }
+ };
+
+ static ref_t create_alloc(
+ const paddr_t& paddr,
+ const laddr_t& laddr,
+ extent_len_t len,
+ extent_types_t type) {
+ assert(is_backref_mapped_type(type));
+ assert(laddr != L_ADDR_NULL);
+ return std::make_unique<backref_entry_t>(
+ paddr, laddr, len, type);
+ }
+
+ static ref_t create_retire(
+ const paddr_t& paddr,
+ extent_len_t len,
+ extent_types_t type) {
+ assert(is_backref_mapped_type(type) ||
+ is_retired_placeholder_type(type));
+ return std::make_unique<backref_entry_t>(
+ paddr, L_ADDR_NULL, len, type);
+ }
+
+ static ref_t create(const alloc_blk_t& delta) {
+ return std::make_unique<backref_entry_t>(
+ delta.paddr, delta.laddr, delta.len, delta.type);
+ }
+};
+
+inline std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) {
+ return out << "backref_entry_t{"
+ << ent.paddr << "~0x" << std::hex << ent.len << std::dec << ", "
+ << "laddr: " << ent.laddr << ", "
+ << "type: " << ent.type
+ << "}";
+}
+
+using backref_entry_ref = backref_entry_t::ref_t;
+using backref_entry_mset_t = backref_entry_t::multiset_t;
+using backref_entry_refs_t = std::vector<backref_entry_ref>;
+using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>;
+using backref_entry_query_set_t = std::set<backref_entry_t, backref_entry_t::cmp_t>;
+
+} // namespace crimson::os::seastore
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::backref_entry_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/backref_manager.h b/src/crimson/os/seastore/backref_manager.h
index 3feedb997b4..8c746b571b2 100644
--- a/src/crimson/os/seastore/backref_manager.h
+++ b/src/crimson/os/seastore/backref_manager.h
@@ -6,6 +6,7 @@
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/backref_mapping.h"
namespace crimson::os::seastore {
diff --git a/src/crimson/os/seastore/backref_mapping.h b/src/crimson/os/seastore/backref_mapping.h
new file mode 100644
index 00000000000..d0a6a0ea6ff
--- /dev/null
+++ b/src/crimson/os/seastore/backref_mapping.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+
+namespace crimson::os::seastore {
+
+class BackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> {
+ extent_types_t type;
+public:
+ BackrefMapping(op_context_t<paddr_t> ctx)
+ : BtreeNodeMapping(ctx) {}
+ template <typename... T>
+ BackrefMapping(extent_types_t type, T&&... t)
+ : BtreeNodeMapping(std::forward<T>(t)...),
+ type(type) {}
+ extent_types_t get_type() const {
+ return type;
+ }
+};
+
+using BackrefMappingRef = std::unique_ptr<BackrefMapping>;
+using backref_pin_list_t = std::list<BackrefMappingRef>;
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc
deleted file mode 100644
index f0d507a24c4..00000000000
--- a/src/crimson/os/seastore/btree/btree_range_pin.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "crimson/os/seastore/btree/btree_range_pin.h"
-#include "crimson/os/seastore/btree/fixed_kv_node.h"
-
-namespace crimson::os::seastore {
-
-template <typename key_t, typename val_t>
-get_child_ret_t<LogicalCachedExtent>
-BtreeNodeMapping<key_t, val_t>::get_logical_extent(
- Transaction &t)
-{
- ceph_assert(is_parent_viewable());
- assert(pos != std::numeric_limits<uint16_t>::max());
- ceph_assert(t.get_trans_id() == ctx.trans.get_trans_id());
- auto &p = (FixedKVNode<key_t>&)*parent;
- auto k = this->is_indirect()
- ? this->get_intermediate_base()
- : get_key();
- auto v = p.template get_child<LogicalCachedExtent>(ctx, pos, k);
- if (!v.has_child()) {
- this->child_pos = v.get_child_pos();
- }
- return v;
-}
-
-template <typename key_t, typename val_t>
-bool BtreeNodeMapping<key_t, val_t>::is_stable() const
-{
- assert(!this->parent_modified());
- assert(pos != std::numeric_limits<uint16_t>::max());
- auto &p = (FixedKVNode<key_t>&)*parent;
- auto k = this->is_indirect()
- ? this->get_intermediate_base()
- : get_key();
- return p.is_child_stable(ctx, pos, k);
-}
-
-template <typename key_t, typename val_t>
-bool BtreeNodeMapping<key_t, val_t>::is_data_stable() const
-{
- assert(!this->parent_modified());
- assert(pos != std::numeric_limits<uint16_t>::max());
- auto &p = (FixedKVNode<key_t>&)*parent;
- auto k = this->is_indirect()
- ? this->get_intermediate_base()
- : get_key();
- return p.is_child_data_stable(ctx, pos, k);
-}
-
-template class BtreeNodeMapping<laddr_t, paddr_t>;
-template class BtreeNodeMapping<paddr_t, laddr_t>;
-} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h
index 91751801e5d..bfd350a8bed 100644
--- a/src/crimson/os/seastore/btree/btree_range_pin.h
+++ b/src/crimson/os/seastore/btree/btree_range_pin.h
@@ -7,11 +7,12 @@
#include "crimson/common/log.h"
-#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction.h"
namespace crimson::os::seastore {
+class Cache;
template <typename node_key_t>
struct op_context_t {
@@ -116,8 +117,6 @@ protected:
extent_len_t len = 0;
fixed_kv_node_meta_t<key_t> range;
uint16_t pos = std::numeric_limits<uint16_t>::max();
-
- virtual std::unique_ptr<BtreeNodeMapping> _duplicate(op_context_t<key_t>) const = 0;
fixed_kv_node_meta_t<key_t> _get_pin_range() const {
return range;
}
@@ -139,11 +138,7 @@ public:
len(len),
range(meta),
pos(pos)
- {
- if (!parent->is_pending()) {
- this->child_pos = {parent, pos};
- }
- }
+ {}
CachedExtentRef get_parent() const final {
return parent;
@@ -162,11 +157,6 @@ public:
return len;
}
- extent_types_t get_type() const override {
- ceph_abort("should never happen");
- return extent_types_t::ROOT;
- }
-
val_t get_val() const final {
if constexpr (std::is_same_v<val_t, paddr_t>) {
return value.get_paddr();
@@ -180,16 +170,6 @@ public:
return range.begin;
}
- PhysicalNodeMappingRef<key_t, val_t> duplicate() const final {
- auto ret = _duplicate(ctx);
- ret->range = range;
- ret->value = value;
- ret->parent = parent;
- ret->len = len;
- ret->pos = pos;
- return ret;
- }
-
bool has_been_invalidated() const final {
return parent->has_been_invalidated();
}
@@ -215,9 +195,6 @@ public:
return unviewable;
}
- get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction&) final;
- bool is_stable() const final;
- bool is_data_stable() const final;
bool is_parent_viewable() const final {
ceph_assert(parent);
if (!parent->is_valid()) {
diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h
index cb4fff32750..04ebcc7e2ca 100644
--- a/src/crimson/os/seastore/btree/fixed_kv_btree.h
+++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h
@@ -32,10 +32,6 @@ inline ChildableCachedExtent* get_reserved_ptr() {
template <typename T>
phy_tree_root_t& get_phy_tree_root(root_t& r);
-using get_child_iertr =
- ::crimson::interruptible::interruptible_errorator<
- typename trans_intr::condition,
- get_child_ertr>;
using get_phy_tree_root_node_ret =
std::pair<bool, get_child_iertr::future<CachedExtentRef>>;
@@ -1501,7 +1497,7 @@ private:
// checking the lba child must be atomic with creating
// and linking the absent child
if (v.has_child()) {
- return trans_intr::make_interruptible(std::move(v.get_child_fut())
+ return std::move(v.get_child_fut()
).si_then([on_found=std::move(on_found), node_iter, c,
parent_entry](auto child) {
LOG_PREFIX(FixedKVBtree::lookup_internal_level);
@@ -1571,7 +1567,7 @@ private:
// checking the lba child must be atomic with creating
// and linking the absent child
if (v.has_child()) {
- return trans_intr::make_interruptible(std::move(v.get_child_fut())
+ return std::move(v.get_child_fut()
).si_then([on_found=std::move(on_found), node_iter, c,
parent_entry](auto child) {
LOG_PREFIX(FixedKVBtree::lookup_leaf);
@@ -2126,7 +2122,7 @@ private:
// checking the lba child must be atomic with creating
// and linking the absent child
if (v.has_child()) {
- return trans_intr::make_interruptible(std::move(v.get_child_fut())
+ return std::move(v.get_child_fut()
).si_then([do_merge=std::move(do_merge), &pos,
donor_iter, donor_is_left, c, parent_pos](auto child) {
LOG_PREFIX(FixedKVBtree::merge_level);
diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h
index 09f54a4f2d0..63e2ca38c42 100644
--- a/src/crimson/os/seastore/btree/fixed_kv_node.h
+++ b/src/crimson/os/seastore/btree/fixed_kv_node.h
@@ -165,6 +165,11 @@ struct FixedKVNode : ChildableCachedExtent {
: ChildableCachedExtent(std::move(ptr)),
children(capacity, nullptr),
capacity(capacity) {}
+ // Must be identical with FixedKVNode(capacity, ptr) after on_fully_loaded()
+ explicit FixedKVNode(uint16_t capacity, extent_len_t length)
+ : ChildableCachedExtent(length),
+ children(capacity, nullptr),
+ capacity(capacity) {}
FixedKVNode(const FixedKVNode &rhs)
: ChildableCachedExtent(rhs),
range(rhs.range),
@@ -708,12 +713,17 @@ struct FixedKVInternalNode
node_size,
node_type_t>;
- FixedKVInternalNode(ceph::bufferptr &&ptr)
- : FixedKVNode<NODE_KEY>(CAPACITY, std::move(ptr)),
- node_layout_t(this->get_bptr().c_str()) {}
+ explicit FixedKVInternalNode(ceph::bufferptr &&ptr)
+ : FixedKVNode<NODE_KEY>(CAPACITY, std::move(ptr)) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+ // Must be identical with FixedKVInternalNode(ptr) after on_fully_loaded()
+ explicit FixedKVInternalNode(extent_len_t length)
+ : FixedKVNode<NODE_KEY>(CAPACITY, length) {}
FixedKVInternalNode(const FixedKVInternalNode &rhs)
- : FixedKVNode<NODE_KEY>(rhs),
- node_layout_t(this->get_bptr().c_str()) {}
+ : FixedKVNode<NODE_KEY>(rhs) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
bool have_children() const final {
return true;
@@ -985,6 +995,10 @@ struct FixedKVInternalNode
pivot);
}
+ void on_fully_loaded() final {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+
/**
* Internal relative addresses on read or in memory prior to commit
* are either record or block relative depending on whether this
@@ -994,8 +1008,7 @@ struct FixedKVInternalNode
* resolve_relative_addrs fixes up relative internal references
* based on base.
*/
- void resolve_relative_addrs(paddr_t base)
- {
+ void resolve_relative_addrs(paddr_t base) final {
LOG_PREFIX(FixedKVInternalNode::resolve_relative_addrs);
for (auto i: *this) {
if (i->get_val().is_relative()) {
@@ -1122,13 +1135,18 @@ struct FixedKVLeafNode
node_type_t,
has_children>;
using base_t = FixedKVNode<NODE_KEY>;
- FixedKVLeafNode(ceph::bufferptr &&ptr)
- : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, std::move(ptr)),
- node_layout_t(this->get_bptr().c_str()) {}
+ explicit FixedKVLeafNode(ceph::bufferptr &&ptr)
+ : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, std::move(ptr)) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+ // Must be identical with FixedKVLeafNode(ptr) after on_fully_loaded()
+ explicit FixedKVLeafNode(extent_len_t length)
+ : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, length) {}
FixedKVLeafNode(const FixedKVLeafNode &rhs)
: FixedKVNode<NODE_KEY>(rhs),
- node_layout_t(this->get_bptr().c_str()),
- modifications(rhs.modifications) {}
+ modifications(rhs.modifications) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
static constexpr bool do_has_children = has_children;
// for the stable extent, modifications is always 0;
@@ -1235,6 +1253,10 @@ struct FixedKVLeafNode
}
}
+ void on_fully_loaded() final {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+
void prepare_commit() final {
if constexpr (has_children) {
if (this->is_initial_pending()) {
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
index 5dcb7514ee1..86f816e1648 100644
--- a/src/crimson/os/seastore/cache.cc
+++ b/src/crimson/os/seastore/cache.cc
@@ -28,15 +28,6 @@ SET_SUBSYS(seastore_cache);
namespace crimson::os::seastore {
-std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) {
- return out << "backref_entry_t{"
- << ent.paddr << "~" << ent.len << ", "
- << "laddr: " << ent.laddr << ", "
- << "type: " << ent.type << ", "
- << "seq: " << ent.seq << ", "
- << "}";
-}
-
Cache::Cache(
ExtentPlacementManager &epm)
: epm(epm),
@@ -44,7 +35,7 @@ Cache::Cache(
"seastore_cache_lru_size"))
{
LOG_PREFIX(Cache::Cache);
- INFO("created, lru_capacity={}B", lru.get_capacity_bytes());
+ INFO("created, lru_capacity=0x{:x}B", lru.get_capacity_bytes());
register_metrics();
segment_providers_by_device_id.resize(DEVICE_ID_MAX, nullptr);
}
@@ -63,18 +54,18 @@ Cache::retire_extent_ret Cache::retire_extent_addr(
Transaction &t, paddr_t addr, extent_len_t length)
{
LOG_PREFIX(Cache::retire_extent_addr);
- TRACET("retire {}~{}", t, addr, length);
+ TRACET("retire {}~0x{:x}", t, addr, length);
assert(addr.is_real() && !addr.is_block_relative());
CachedExtentRef ext;
auto result = t.get_extent(addr, &ext);
if (result == Transaction::get_extent_ret::PRESENT) {
- DEBUGT("retire {}~{} on t -- {}", t, addr, length, *ext);
+ DEBUGT("retire {}~0x{:x} on t -- {}", t, addr, length, *ext);
t.add_to_retired_set(CachedExtentRef(&*ext));
return retire_extent_iertr::now();
} else if (result == Transaction::get_extent_ret::RETIRED) {
- ERRORT("retire {}~{} failed, already retired -- {}", t, addr, length, *ext);
+ ERRORT("retire {}~0x{:x} failed, already retired -- {}", t, addr, length, *ext);
ceph_abort();
}
@@ -85,7 +76,7 @@ Cache::retire_extent_ret Cache::retire_extent_addr(
// retiring is not included by the cache hit metrics
ext = query_cache(addr);
if (ext) {
- DEBUGT("retire {}~{} in cache -- {}", t, addr, length, *ext);
+ DEBUGT("retire {}~0x{:x} in cache -- {}", t, addr, length, *ext);
} else {
// add a new placeholder to Cache
ext = CachedExtent::make_cached_extent_ref<
@@ -95,7 +86,7 @@ Cache::retire_extent_ret Cache::retire_extent_addr(
PLACEMENT_HINT_NULL,
NULL_GENERATION,
TRANS_ID_NULL);
- DEBUGT("retire {}~{} as placeholder, add extent -- {}",
+ DEBUGT("retire {}~0x{:x} as placeholder, add extent -- {}",
t, addr, length, *ext);
add_extent(ext);
}
@@ -123,7 +114,7 @@ void Cache::retire_absent_extent_addr(
PLACEMENT_HINT_NULL,
NULL_GENERATION,
TRANS_ID_NULL);
- DEBUGT("retire {}~{} as placeholder, add extent -- {}",
+ DEBUGT("retire {}~0x{:x} as placeholder, add extent -- {}",
t, addr, length, *ext);
add_extent(ext);
t.add_to_read_set(ext);
@@ -172,6 +163,7 @@ void Cache::register_metrics()
{extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")},
{extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")},
{extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")},
+ {extent_types_t::ROOT_META, sm::label_instance("ext", "ROOT_META")},
{extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")},
{extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")},
{extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")},
@@ -1081,7 +1073,7 @@ CachedExtentRef Cache::alloc_new_extent_by_type(
)
{
LOG_PREFIX(Cache::alloc_new_extent_by_type);
- SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ SUBDEBUGT(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}",
t, type, length, hint, rewrite_gen_printer_t{gen});
ceph_assert(get_extent_category(type) == data_category_t::METADATA);
switch (type) {
@@ -1093,6 +1085,9 @@ CachedExtentRef Cache::alloc_new_extent_by_type(
case extent_types_t::LADDR_LEAF:
return alloc_new_non_data_extent<lba_manager::btree::LBALeafNode>(
t, length, hint, gen);
+ case extent_types_t::ROOT_META:
+ return alloc_new_non_data_extent<RootMetaBlock>(
+ t, length, hint, gen);
case extent_types_t::ONODE_BLOCK_STAGED:
return alloc_new_non_data_extent<onode::SeastoreNodeExtent>(
t, length, hint, gen);
@@ -1129,7 +1124,7 @@ std::vector<CachedExtentRef> Cache::alloc_new_data_extents_by_type(
)
{
LOG_PREFIX(Cache::alloc_new_data_extents_by_type);
- SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ SUBDEBUGT(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}",
t, type, length, hint, rewrite_gen_printer_t{gen});
ceph_assert(get_extent_category(type) == data_category_t::DATA);
std::vector<CachedExtentRef> res;
@@ -1344,21 +1339,39 @@ record_t Cache::prepare_record(
io_stat_t retire_stat;
std::vector<alloc_delta_t> alloc_deltas;
alloc_delta_t rel_delta;
+ backref_entry_refs_t backref_entries;
rel_delta.op = alloc_delta_t::op_types_t::CLEAR;
for (auto &i: t.retired_set) {
auto &extent = i.extent;
get_by_ext(efforts.retire_by_ext,
extent->get_type()).increment(extent->get_length());
retire_stat.increment(extent->get_length());
- DEBUGT("retired and remove extent -- {}", t, *extent);
+ DEBUGT("retired and remove extent {}~0x{:x} -- {}",
+ t, extent->get_paddr(), extent->get_length(), *extent);
commit_retire_extent(t, extent);
- if (is_backref_mapped_extent_node(extent) ||
- is_retired_placeholder_type(extent->get_type())) {
+
+ // Note: commit extents and backref allocations in the same place
+ if (is_backref_mapped_type(extent->get_type()) ||
+ is_retired_placeholder_type(extent->get_type())) {
+ DEBUGT("backref_entry free {}~0x{:x}",
+ t,
+ extent->get_paddr(),
+ extent->get_length());
rel_delta.alloc_blk_ranges.emplace_back(
- extent->get_paddr(),
- L_ADDR_NULL,
- extent->get_length(),
- extent->get_type());
+ alloc_blk_t::create_retire(
+ extent->get_paddr(),
+ extent->get_length(),
+ extent->get_type()));
+ backref_entries.emplace_back(
+ backref_entry_t::create_retire(
+ extent->get_paddr(),
+ extent->get_length(),
+ extent->get_type()));
+ } else if (is_backref_node(extent->get_type())) {
+ remove_backref_extent(extent->get_paddr());
+ } else {
+ ERRORT("Got unexpected extent type: {}", t, *extent);
+      ceph_abort("impossible");
}
}
alloc_deltas.emplace_back(std::move(rel_delta));
@@ -1395,27 +1408,40 @@ record_t Cache::prepare_record(
if (modify_time == NULL_TIME) {
modify_time = commit_time;
}
+ laddr_t fresh_laddr;
+ if (i->is_logical()) {
+ fresh_laddr = i->cast<LogicalCachedExtent>()->get_laddr();
+ } else if (is_lba_node(i->get_type())) {
+ fresh_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin;
+ } else {
+ fresh_laddr = L_ADDR_NULL;
+ }
record.push_back(extent_t{
i->get_type(),
- i->is_logical()
- ? i->cast<LogicalCachedExtent>()->get_laddr()
- : (is_lba_node(i->get_type())
- ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
- : L_ADDR_NULL),
+ fresh_laddr,
std::move(bl)
},
modify_time);
- if (i->is_valid()
- && is_backref_mapped_extent_node(i)) {
+
+ if (!i->is_valid()) {
+ continue;
+ }
+ if (is_backref_mapped_type(i->get_type())) {
+ laddr_t alloc_laddr;
+ if (i->is_logical()) {
+ alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr();
+ } else if (is_lba_node(i->get_type())) {
+ alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin;
+ } else {
+ assert(i->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL);
+ alloc_laddr = L_ADDR_MIN;
+ }
alloc_delta.alloc_blk_ranges.emplace_back(
- i->get_paddr(),
- i->is_logical()
- ? i->cast<LogicalCachedExtent>()->get_laddr()
- : (is_lba_node(i->get_type())
- ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
- : L_ADDR_NULL),
- i->get_length(),
- i->get_type());
+ alloc_blk_t::create_alloc(
+ i->get_paddr(),
+ alloc_laddr,
+ i->get_length(),
+ i->get_type()));
}
}
@@ -1426,14 +1452,20 @@ record_t Cache::prepare_record(
get_by_ext(efforts.fresh_ool_by_ext,
i->get_type()).increment(i->get_length());
i->prepare_commit();
- if (is_backref_mapped_extent_node(i)) {
+ if (is_backref_mapped_type(i->get_type())) {
+ laddr_t alloc_laddr;
+ if (i->is_logical()) {
+ alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr();
+ } else {
+ assert(is_lba_node(i->get_type()));
+ alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin;
+ }
alloc_delta.alloc_blk_ranges.emplace_back(
- i->get_paddr(),
- i->is_logical()
- ? i->cast<LogicalCachedExtent>()->get_laddr()
- : i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin,
- i->get_length(),
- i->get_type());
+ alloc_blk_t::create_alloc(
+ i->get_paddr(),
+ alloc_laddr,
+ i->get_length(),
+ i->get_type()));
}
}
@@ -1451,19 +1483,57 @@ record_t Cache::prepare_record(
i->state = CachedExtent::extent_state_t::CLEAN;
assert(i->is_logical());
i->clear_modified_region();
- touch_extent(*i, &trans_src);
+ touch_extent(*i, &trans_src, t.get_cache_hint());
DEBUGT("inplace rewrite ool block is commmitted -- {}", t, *i);
}
+ auto existing_stats = t.get_existing_block_stats();
+ DEBUGT("total existing blocks num: {}, exist clean num: {}, "
+ "exist mutation pending num: {}",
+ t,
+ existing_stats.valid_num,
+ existing_stats.clean_num,
+ existing_stats.mutated_num);
for (auto &i: t.existing_block_list) {
- if (i->is_valid()) {
- alloc_delta.alloc_blk_ranges.emplace_back(
- i->get_paddr(),
+ assert(is_logical_type(i->get_type()));
+ if (!i->is_valid()) {
+ continue;
+ }
+
+ if (i->is_exist_clean()) {
+ i->state = CachedExtent::extent_state_t::CLEAN;
+ } else {
+ assert(i->is_exist_mutation_pending());
+ // i->state must become DIRTY in complete_commit()
+ }
+
+ // exist mutation pending extents must be in t.mutated_block_list
+ add_extent(i);
+ const auto t_src = t.get_src();
+ if (i->is_dirty()) {
+ add_to_dirty(i, &t_src);
+ } else {
+ touch_extent(*i, &t_src, t.get_cache_hint());
+ }
+
+ alloc_delta.alloc_blk_ranges.emplace_back(
+ alloc_blk_t::create_alloc(
+ i->get_paddr(),
i->cast<LogicalCachedExtent>()->get_laddr(),
i->get_length(),
- i->get_type());
- }
+ i->get_type()));
+
+ // Note: commit extents and backref allocations in the same place
+ // Note: remapping is split into 2 steps, retire and alloc, they must be
+ // committed atomically together
+ backref_entries.emplace_back(
+ backref_entry_t::create_alloc(
+ i->get_paddr(),
+ i->cast<LogicalCachedExtent>()->get_laddr(),
+ i->get_length(),
+ i->get_type()));
}
+
alloc_deltas.emplace_back(std::move(alloc_delta));
for (auto b : alloc_deltas) {
@@ -1517,6 +1587,9 @@ record_t Cache::prepare_record(
record.push_back(std::move(delta));
}
+ apply_backref_mset(backref_entries);
+ t.set_backref_entries(std::move(backref_entries));
+
ceph_assert(t.get_fresh_block_stats().num ==
t.inline_block_list.size() +
t.ool_block_list.size() +
@@ -1616,26 +1689,35 @@ record_t Cache::prepare_record(
return record;
}
-void Cache::backref_batch_update(
- std::vector<backref_entry_ref> &&list,
- const journal_seq_t &seq)
+void Cache::apply_backref_byseq(
+ backref_entry_refs_t&& backref_entries,
+ const journal_seq_t& seq)
{
- LOG_PREFIX(Cache::backref_batch_update);
- DEBUG("inserting {} entries at {}", list.size(), seq);
- ceph_assert(seq != JOURNAL_SEQ_NULL);
-
- for (auto &ent : list) {
- backref_entry_mset.insert(*ent);
+ LOG_PREFIX(Cache::apply_backref_byseq);
+ DEBUG("backref_entry apply {} entries at {}",
+ backref_entries.size(), seq);
+ assert(seq != JOURNAL_SEQ_NULL);
+ if (backref_entries.empty()) {
+ return;
}
-
- auto iter = backref_entryrefs_by_seq.find(seq);
- if (iter == backref_entryrefs_by_seq.end()) {
- backref_entryrefs_by_seq.emplace(seq, std::move(list));
+ if (backref_entryrefs_by_seq.empty()) {
+ backref_entryrefs_by_seq.insert(
+ backref_entryrefs_by_seq.end(),
+ {seq, std::move(backref_entries)});
+ return;
+ }
+ auto last = backref_entryrefs_by_seq.rbegin();
+ assert(last->first <= seq);
+ if (last->first == seq) {
+ last->second.insert(
+ last->second.end(),
+ std::make_move_iterator(backref_entries.begin()),
+ std::make_move_iterator(backref_entries.end()));
} else {
- iter->second.insert(
- iter->second.end(),
- std::make_move_iterator(list.begin()),
- std::make_move_iterator(list.end()));
+ assert(last->first < seq);
+ backref_entryrefs_by_seq.insert(
+ backref_entryrefs_by_seq.end(),
+ {seq, std::move(backref_entries)});
}
}
@@ -1648,7 +1730,7 @@ void Cache::complete_commit(
SUBTRACET(seastore_t, "final_block_start={}, start_seq={}",
t, final_block_start, start_seq);
- std::vector<backref_entry_ref> backref_list;
+ backref_entry_refs_t backref_entries;
t.for_each_finalized_fresh_block([&](const CachedExtentRef &i) {
if (!i->is_valid()) {
return;
@@ -1677,24 +1759,30 @@ void Cache::complete_commit(
add_extent(i);
assert(!i->is_dirty());
const auto t_src = t.get_src();
- touch_extent(*i, &t_src);
+ touch_extent(*i, &t_src, t.get_cache_hint());
epm.commit_space_used(i->get_paddr(), i->get_length());
- if (is_backref_mapped_extent_node(i)) {
- DEBUGT("backref_list new {} len {}",
+
+ // Note: commit extents and backref allocations in the same place
+ if (is_backref_mapped_type(i->get_type())) {
+ DEBUGT("backref_entry alloc {}~0x{:x}",
t,
i->get_paddr(),
i->get_length());
- backref_list.emplace_back(
- std::make_unique<backref_entry_t>(
+ laddr_t alloc_laddr;
+ if (i->is_logical()) {
+ alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr();
+ } else if (is_lba_node(i->get_type())) {
+ alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin;
+ } else {
+ assert(i->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL);
+ alloc_laddr = L_ADDR_MIN;
+ }
+ backref_entries.emplace_back(
+ backref_entry_t::create_alloc(
i->get_paddr(),
- i->is_logical()
- ? i->cast<LogicalCachedExtent>()->get_laddr()
- : (is_lba_node(i->get_type())
- ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
- : L_ADDR_NULL),
+ alloc_laddr,
i->get_length(),
- i->get_type(),
- start_seq));
+ i->get_type()));
} else if (is_backref_node(i->get_type())) {
add_backref_extent(
i->get_paddr(),
@@ -1731,9 +1819,10 @@ void Cache::complete_commit(
epm.mark_space_free(extent->get_paddr(), extent->get_length());
}
for (auto &i: t.existing_block_list) {
- if (i->is_valid()) {
- epm.mark_space_used(i->get_paddr(), i->get_length());
+ if (!i->is_valid()) {
+ continue;
}
+ epm.mark_space_used(i->get_paddr(), i->get_length());
}
for (auto &i: t.mutated_block_list) {
@@ -1747,64 +1836,10 @@ void Cache::complete_commit(
for (auto &i: t.retired_set) {
auto &extent = i.extent;
extent->dirty_from_or_retired_at = start_seq;
- if (is_backref_mapped_extent_node(extent) ||
- is_retired_placeholder_type(extent->get_type())) {
- DEBUGT("backref_list free {} len {}",
- t,
- extent->get_paddr(),
- extent->get_length());
- backref_list.emplace_back(
- std::make_unique<backref_entry_t>(
- extent->get_paddr(),
- L_ADDR_NULL,
- extent->get_length(),
- extent->get_type(),
- start_seq));
- } else if (is_backref_node(extent->get_type())) {
- remove_backref_extent(extent->get_paddr());
- } else {
- ERRORT("{}", t, *extent);
- ceph_abort("not possible");
- }
}
- auto existing_stats = t.get_existing_block_stats();
- DEBUGT("total existing blocks num: {}, exist clean num: {}, "
- "exist mutation pending num: {}",
- t,
- existing_stats.valid_num,
- existing_stats.clean_num,
- existing_stats.mutated_num);
- for (auto &i: t.existing_block_list) {
- if (i->is_valid()) {
- if (i->is_exist_clean()) {
- i->state = CachedExtent::extent_state_t::CLEAN;
- } else {
- assert(i->state == CachedExtent::extent_state_t::DIRTY);
- }
- DEBUGT("backref_list new existing {} len {}",
- t,
- i->get_paddr(),
- i->get_length());
- backref_list.emplace_back(
- std::make_unique<backref_entry_t>(
- i->get_paddr(),
- i->cast<LogicalCachedExtent>()->get_laddr(),
- i->get_length(),
- i->get_type(),
- start_seq));
- add_extent(i);
- const auto t_src = t.get_src();
- if (i->is_dirty()) {
- add_to_dirty(i, &t_src);
- } else {
- touch_extent(*i, &t_src);
- }
- }
- }
- if (!backref_list.empty()) {
- backref_batch_update(std::move(backref_list), start_seq);
- }
+ apply_backref_byseq(t.move_backref_entries(), start_seq);
+ commit_backref_entries(std::move(backref_entries), start_seq);
for (auto &i: t.pre_alloc_list) {
if (!i->is_valid()) {
@@ -1822,7 +1857,7 @@ void Cache::init()
remove_extent(root, nullptr);
root = nullptr;
}
- root = new RootBlock();
+ root = CachedExtent::make_cached_extent_ref<RootBlock>();
root->init(CachedExtent::extent_state_t::CLEAN,
P_ADDR_ROOT,
PLACEMENT_HINT_NULL,
@@ -1927,25 +1962,18 @@ Cache::replay_delta(
alloc_delta_t alloc_delta;
decode(alloc_delta, delta.bl);
- std::vector<backref_entry_ref> backref_list;
+ backref_entry_refs_t backref_entries;
for (auto &alloc_blk : alloc_delta.alloc_blk_ranges) {
if (alloc_blk.paddr.is_relative()) {
assert(alloc_blk.paddr.is_record_relative());
alloc_blk.paddr = record_base.add_relative(alloc_blk.paddr);
}
- DEBUG("replay alloc_blk {}~{} {}, journal_seq: {}",
+ DEBUG("replay alloc_blk {}~0x{:x} {}, journal_seq: {}",
alloc_blk.paddr, alloc_blk.len, alloc_blk.laddr, journal_seq);
- backref_list.emplace_back(
- std::make_unique<backref_entry_t>(
- alloc_blk.paddr,
- alloc_blk.laddr,
- alloc_blk.len,
- alloc_blk.type,
- journal_seq));
- }
- if (!backref_list.empty()) {
- backref_batch_update(std::move(backref_list), journal_seq);
+ backref_entries.emplace_back(
+ backref_entry_t::create(alloc_blk));
}
+ commit_backref_entries(std::move(backref_entries), journal_seq);
return replay_delta_ertr::make_ready_future<std::pair<bool, CachedExtentRef>>(
std::make_pair(true, nullptr));
}
@@ -1998,8 +2026,9 @@ Cache::replay_delta(
[](CachedExtent &) {},
[this](CachedExtent &ext) {
// replay is not included by the cache hit metrics
- touch_extent(ext, nullptr);
- }) :
+ touch_extent(ext, nullptr, CACHE_HINT_TOUCH);
+ },
+ nullptr) :
_get_extent_if_cached(
delta.paddr)
).handle_error(
@@ -2162,7 +2191,8 @@ Cache::do_get_caching_extent_by_type(
laddr_t laddr,
extent_len_t length,
extent_init_func_t &&extent_init_func,
- extent_init_func_t &&on_cache)
+ extent_init_func_t &&on_cache,
+ const Transaction::src_t* p_src)
{
return [=, this, extent_init_func=std::move(extent_init_func)]() mutable {
switch (type) {
@@ -2171,55 +2201,61 @@ Cache::do_get_caching_extent_by_type(
return get_extent_ertr::make_ready_future<CachedExtentRef>();
case extent_types_t::BACKREF_INTERNAL:
return do_get_caching_extent<backref::BackrefInternalNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::BACKREF_LEAF:
return do_get_caching_extent<backref::BackrefLeafNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::LADDR_INTERNAL:
return do_get_caching_extent<lba_manager::btree::LBAInternalNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::LADDR_LEAF:
return do_get_caching_extent<lba_manager::btree::LBALeafNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
+ case extent_types_t::ROOT_META:
+ return do_get_caching_extent<RootMetaBlock>(
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
case extent_types_t::OMAP_INNER:
return do_get_caching_extent<omap_manager::OMapInnerNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::OMAP_LEAF:
return do_get_caching_extent<omap_manager::OMapLeafNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::COLL_BLOCK:
return do_get_caching_extent<collection_manager::CollectionNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::ONODE_BLOCK_STAGED:
return do_get_caching_extent<onode::SeastoreNodeExtent>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::OBJECT_DATA_BLOCK:
return do_get_caching_extent<ObjectDataBlock>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
@@ -2228,13 +2264,13 @@ Cache::do_get_caching_extent_by_type(
return get_extent_ertr::make_ready_future<CachedExtentRef>();
case extent_types_t::TEST_BLOCK:
return do_get_caching_extent<TestBlock>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::TEST_BLOCK_PHYSICAL:
return do_get_caching_extent<TestBlockPhysical>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h
index dba3610e95f..a239b861726 100644
--- a/src/crimson/os/seastore/cache.h
+++ b/src/crimson/os/seastore/cache.h
@@ -3,13 +3,13 @@
#pragma once
-#include <iostream>
-
#include "seastar/core/shared_future.hh"
#include "include/buffer.h"
#include "crimson/common/errorator.h"
+#include "crimson/common/errorator-loop.h"
+#include "crimson/os/seastore/backref_entry.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/extent_placement_manager.h"
#include "crimson/os/seastore/logging.h"
@@ -37,86 +37,6 @@ class FixedKVBtree;
class BackrefManager;
class SegmentProvider;
-struct backref_entry_t {
- backref_entry_t(
- const paddr_t paddr,
- const laddr_t laddr,
- const extent_len_t len,
- const extent_types_t type,
- const journal_seq_t seq)
- : paddr(paddr),
- laddr(laddr),
- len(len),
- type(type),
- seq(seq)
- {}
- backref_entry_t(alloc_blk_t alloc_blk)
- : paddr(alloc_blk.paddr),
- laddr(alloc_blk.laddr),
- len(alloc_blk.len),
- type(alloc_blk.type)
- {}
- paddr_t paddr = P_ADDR_NULL;
- laddr_t laddr = L_ADDR_NULL;
- extent_len_t len = 0;
- extent_types_t type =
- extent_types_t::ROOT;
- journal_seq_t seq;
- friend bool operator< (
- const backref_entry_t &l,
- const backref_entry_t &r) {
- return l.paddr < r.paddr;
- }
- friend bool operator> (
- const backref_entry_t &l,
- const backref_entry_t &r) {
- return l.paddr > r.paddr;
- }
- friend bool operator== (
- const backref_entry_t &l,
- const backref_entry_t &r) {
- return l.paddr == r.paddr;
- }
-
- using set_hook_t =
- boost::intrusive::set_member_hook<
- boost::intrusive::link_mode<
- boost::intrusive::auto_unlink>>;
- set_hook_t backref_set_hook;
- using backref_set_member_options = boost::intrusive::member_hook<
- backref_entry_t,
- set_hook_t,
- &backref_entry_t::backref_set_hook>;
- using multiset_t = boost::intrusive::multiset<
- backref_entry_t,
- backref_set_member_options,
- boost::intrusive::constant_time_size<false>>;
-
- struct cmp_t {
- using is_transparent = paddr_t;
- bool operator()(
- const backref_entry_t &l,
- const backref_entry_t &r) const {
- return l.paddr < r.paddr;
- }
- bool operator()(const paddr_t l, const backref_entry_t &r) const {
- return l < r.paddr;
- }
- bool operator()(const backref_entry_t &l, const paddr_t r) const {
- return l.paddr < r;
- }
- };
-};
-
-std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent);
-
-using backref_entry_ref = std::unique_ptr<backref_entry_t>;
-using backref_entry_mset_t = backref_entry_t::multiset_t;
-using backref_entry_refs_t = std::vector<backref_entry_ref>;
-using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>;
-using backref_entry_query_set_t = std::set<
- backref_entry_t, backref_entry_t::cmp_t>;
-
/**
* Cache
*
@@ -204,6 +124,7 @@ public:
TransactionRef create_transaction(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint,
bool is_weak) {
LOG_PREFIX(Cache::create_transaction);
@@ -217,7 +138,8 @@ public:
[this](Transaction& t) {
return on_transaction_destruct(t);
},
- ++next_id
+ ++next_id,
+ cache_hint
);
SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}",
*ret, name, src, is_weak);
@@ -323,8 +245,9 @@ public:
CachedExtentRef>(ret);
});
} else {
- SUBDEBUGT(seastore_cache, "{} {} is present on t -- {}"
- " without being fully loaded", t, type, offset, *ret);
+ SUBDEBUGT(seastore_cache,
+ "{} {} is present on t -- {} without fully loaded",
+ t, type, offset, *ret);
return get_extent_if_cached_iertr::make_ready_future<
CachedExtentRef>();
}
@@ -354,8 +277,8 @@ public:
if (!ret->is_fully_loaded()) {
// ignore non-full extent
- SUBDEBUGT(seastore_cache, "{} {} is present without "
- "being fully loaded", t, type, offset);
+ SUBDEBUGT(seastore_cache,
+ "{} {} is present without fully loaded", t, type, offset);
return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>();
}
@@ -363,7 +286,7 @@ public:
SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}",
t, type, offset, *ret);
t.add_to_read_set(ret);
- touch_extent(*ret, &t_src);
+ touch_extent(*ret, &t_src, t.get_cache_hint());
return ret->wait_io().then([ret] {
return get_extent_if_cached_iertr::make_ready_future<
CachedExtentRef>(ret);
@@ -394,39 +317,37 @@ public:
extent_len_t length) {
CachedExtentRef ret;
LOG_PREFIX(Cache::get_caching_extent);
+ const auto t_src = t.get_src();
auto result = t.get_extent(offset, &ret);
if (result == Transaction::get_extent_ret::RETIRED) {
- SUBERRORT(seastore_cache, "{} {}~{} is retired on t -- {}",
+ SUBERRORT(seastore_cache, "{} {}~0x{:x} is retired on t -- {}",
t, T::TYPE, offset, length, *ret);
ceph_abort("impossible");
} else if (result == Transaction::get_extent_ret::PRESENT) {
+ assert(ret->get_length() == length);
if (ret->is_fully_loaded()) {
- SUBTRACET(seastore_cache, "{} {}~{} is present on t -- {}",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} is present on t -- {}",
t, T::TYPE, offset, length, *ret);
return ret->wait_io().then([ret] {
return seastar::make_ready_future<TCachedExtentRef<T>>(
ret->cast<T>());
});
} else {
- assert(!ret->is_mutable());
- SUBDEBUGT(seastore_cache, "{} {}~{} is present on t without been \
- fully loaded, reading ... {}", t, T::TYPE, offset, length, *ret);
- auto bp = alloc_cache_buf(ret->get_length());
- ret->set_bptr(std::move(bp));
- return read_extent<T>(
- ret->cast<T>());
+ SUBDEBUGT(seastore_cache,
+ "{} {}~0x{:x} is present on t without fully loaded, reading ... -- {}",
+ t, T::TYPE, offset, length, *ret);
+ return do_read_extent_maybe_partial<T>(ret->cast<T>(), 0, length, &t_src);
}
} else {
- SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} is absent on t, query cache ...",
t, T::TYPE, offset, length);
- auto f = [&t, this](CachedExtent &ext) {
+ auto f = [&t, this, t_src](CachedExtent &ext) {
t.add_to_read_set(CachedExtentRef(&ext));
- const auto t_src = t.get_src();
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent<T>(
- offset, length, [](T &){}, std::move(f))
+ offset, length, [](T &){}, std::move(f), &t_src)
);
}
}
@@ -435,12 +356,15 @@ public:
* get_absent_extent
*
* The extent in query is supposed to be absent in Cache.
+ * partially load buffer from partial_off~partial_len if not present.
*/
template <typename T, typename Func>
get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
Transaction &t,
paddr_t offset,
extent_len_t length,
+ extent_len_t partial_off,
+ extent_len_t partial_len,
Func &&extent_init_func) {
CachedExtentRef ret;
LOG_PREFIX(Cache::get_absent_extent);
@@ -453,13 +377,13 @@ public:
}
#endif
- SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} is absent on t, query cache ...",
t, T::TYPE, offset, length);
- auto f = [&t, this](CachedExtent &ext) {
+ const auto t_src = t.get_src();
+ auto f = [&t, this, t_src](CachedExtent &ext) {
// FIXME: assert(ext.is_stable_clean());
assert(ext.is_stable());
assert(T::TYPE == ext.get_type());
- const auto t_src = t.get_src();
extent_access_stats_t& access_stats = get_by_ext(
get_by_src(stats.access_by_src_ext, t_src),
T::TYPE);
@@ -467,11 +391,12 @@ public:
++stats.access.s.load_absent;
t.add_to_read_set(CachedExtentRef(&ext));
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent<T>(
- offset, length, std::forward<Func>(extent_init_func), std::move(f))
+ offset, length, partial_off, partial_len,
+ std::forward<Func>(extent_init_func), std::move(f), &t_src)
);
}
@@ -495,6 +420,16 @@ public:
return get_absent_extent<T>(t, offset, length, [](T &){});
}
+ template <typename T, typename Func>
+ get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
+ Transaction &t,
+ paddr_t offset,
+ extent_len_t length,
+ Func &&extent_init_func) {
+ return get_absent_extent<T>(t, offset, length, 0, length,
+ std::forward<Func>(extent_init_func));
+ }
+
bool is_viewable_extent_stable(
Transaction &t,
CachedExtentRef extent)
@@ -513,8 +448,7 @@ public:
return view->is_data_stable();
}
- using get_extent_ertr = base_ertr;
- get_extent_ertr::future<CachedExtentRef>
+ get_extent_iertr::future<CachedExtentRef>
get_extent_viewable_by_trans(
Transaction &t,
CachedExtentRef extent)
@@ -539,7 +473,7 @@ public:
if (p_extent->is_mutable()) {
assert(p_extent->is_fully_loaded());
assert(!p_extent->is_pending_io());
- return get_extent_ertr::make_ready_future<CachedExtentRef>(
+ return get_extent_iertr::make_ready_future<CachedExtentRef>(
CachedExtentRef(p_extent));
} else {
assert(p_extent->is_exist_clean());
@@ -555,7 +489,7 @@ public:
++access_stats.cache_lru;
++stats.access.s.cache_lru;
}
- touch_extent(*p_extent, &t_src);
+ touch_extent(*p_extent, &t_src, t.get_cache_hint());
} else {
if (p_extent->is_dirty()) {
++access_stats.trans_dirty;
@@ -574,7 +508,7 @@ public:
if (extent->is_mutable()) {
assert(extent->is_fully_loaded());
assert(!extent->is_pending_io());
- return get_extent_ertr::make_ready_future<CachedExtentRef>(extent);
+ return get_extent_iertr::make_ready_future<CachedExtentRef>(extent);
} else {
assert(extent->is_exist_clean());
p_extent = extent.get();
@@ -583,40 +517,66 @@ public:
// user should not see RETIRED_PLACEHOLDER extents
ceph_assert(!is_retired_placeholder_type(p_extent->get_type()));
- if (!p_extent->is_fully_loaded()) {
- assert(!p_extent->is_mutable());
- ++access_stats.load_present;
- ++stats.access.s.load_present;
- LOG_PREFIX(Cache::get_extent_viewable_by_trans);
- SUBDEBUG(seastore_cache,
- "{} {}~{} is present without been fully loaded, reading ... -- {}",
- p_extent->get_type(), p_extent->get_paddr(),p_extent->get_length(),
- *p_extent);
- auto bp = alloc_cache_buf(p_extent->get_length());
- p_extent->set_bptr(std::move(bp));
- return read_extent<CachedExtent>(CachedExtentRef(p_extent));
- }
- return p_extent->wait_io(
- ).then([p_extent] {
- return get_extent_ertr::make_ready_future<CachedExtentRef>(
+ // for logical extents, handle partial load in TM::read_pin(),
+ // also see read_extent_maybe_partial() and get_absent_extent()
+ assert(is_logical_type(p_extent->get_type()) ||
+ p_extent->is_fully_loaded());
+
+ return trans_intr::make_interruptible(
+ p_extent->wait_io()
+ ).then_interruptible([p_extent] {
+ return get_extent_iertr::make_ready_future<CachedExtentRef>(
CachedExtentRef(p_extent));
});
}
template <typename T>
- using read_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>;
-
- template <typename T>
- read_extent_ret<T> get_extent_viewable_by_trans(
+ get_extent_iertr::future<TCachedExtentRef<T>>
+ get_extent_viewable_by_trans(
Transaction &t,
TCachedExtentRef<T> extent)
{
return get_extent_viewable_by_trans(t, CachedExtentRef(extent.get())
- ).safe_then([](auto p_extent) {
+ ).si_then([](auto p_extent) {
return p_extent->template cast<T>();
});
}
+ // wait extent io or do partial reads
+ template <typename T>
+ get_extent_iertr::future<TCachedExtentRef<T>>
+ read_extent_maybe_partial(
+ Transaction &t,
+ TCachedExtentRef<T> extent,
+ extent_len_t partial_off,
+ extent_len_t partial_len) {
+ assert(is_logical_type(extent->get_type()));
+ if (!extent->is_range_loaded(partial_off, partial_len)) {
+ LOG_PREFIX(Cache::read_extent_maybe_partial);
+ SUBDEBUGT(seastore_cache,
+ "{} {}~0x{:x} is present on t without range 0x{:x}~0x{:x}, reading ... -- {}",
+ t, extent->get_type(), extent->get_paddr(), extent->get_length(),
+ partial_off, partial_len, *extent);
+ const auto t_src = t.get_src();
+ extent_access_stats_t& access_stats = get_by_ext(
+ get_by_src(stats.access_by_src_ext, t_src),
+ extent->get_type());
+ ++access_stats.load_present;
+ ++stats.access.s.load_present;
+ return trans_intr::make_interruptible(
+ do_read_extent_maybe_partial(
+ std::move(extent), partial_off, partial_len, &t_src));
+ } else {
+ // TODO(implement fine-grained-wait):
+ // the range might be already loaded, but we don't know
+ return trans_intr::make_interruptible(
+ extent->wait_io()
+ ).then_interruptible([extent] {
+ return get_extent_iertr::make_ready_future<TCachedExtentRef<T>>(extent);
+ });
+ }
+ }
+
extent_len_t get_block_size() const {
return epm.get_block_size();
}
@@ -628,54 +588,122 @@ public:
}
private:
+ using get_extent_ertr = base_ertr;
+ template <typename T>
+ using read_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>;
+ /// Implements exclusive call to read_extent() for the extent
+ template <typename T>
+ read_extent_ret<T> do_read_extent_maybe_partial(
+ TCachedExtentRef<T>&& extent,
+ extent_len_t partial_off,
+ extent_len_t partial_len,
+ const Transaction::src_t* p_src)
+ {
+ LOG_PREFIX(Cache::do_read_extent_maybe_partial);
+ // They must be atomic:
+ // 1. checking missing range and wait io
+ // 2. checking missing range and read
+  // because the extents in Cache can be accessed concurrently
+ //
+ // TODO(implement fine-grained-wait)
+ assert(!extent->is_range_loaded(partial_off, partial_len));
+ assert(!extent->is_mutable());
+ if (extent->is_pending_io()) {
+ std::optional<Transaction::src_t> src;
+ if (p_src) {
+ src = *p_src;
+ }
+ auto* p_extent = extent.get();
+ return p_extent->wait_io(
+ ).then([extent=std::move(extent), partial_off, partial_len, this, FNAME, src]() mutable
+ -> read_extent_ret<T> {
+ if (extent->is_range_loaded(partial_off, partial_len)) {
+ SUBDEBUG(seastore_cache,
+ "{} {}~0x{:x} got range 0x{:x}~0x{:x} ... -- {}",
+ extent->get_type(), extent->get_paddr(), extent->get_length(),
+ partial_off, partial_len, *extent);
+ // we don't know whether the target range is loading or not
+ if (extent->is_pending_io()) {
+ auto* p_extent = extent.get();
+ return p_extent->wait_io(
+ ).then([extent=std::move(extent)]() mutable {
+ return seastar::make_ready_future<TCachedExtentRef<T>>(std::move(extent));
+ });
+ } else {
+ return seastar::make_ready_future<TCachedExtentRef<T>>(std::move(extent));
+ }
+ } else { // range not loaded
+ SUBDEBUG(seastore_cache,
+ "{} {}~0x{:x} without range 0x{:x}~0x{:x} ... -- {}",
+ extent->get_type(), extent->get_paddr(), extent->get_length(),
+ partial_off, partial_len, *extent);
+ Transaction::src_t* p_src = (src.has_value() ? &src.value() : nullptr);
+ return do_read_extent_maybe_partial(
+ std::move(extent), partial_off, partial_len, p_src);
+ }
+ });
+ } else {
+ SUBDEBUG(seastore_cache,
+ "{} {}~0x{:x} is not pending without range 0x{:x}~0x{:x}, reading ... -- {}",
+ extent->get_type(), extent->get_paddr(), extent->get_length(),
+ partial_off, partial_len, *extent);
+ return read_extent<T>(
+ std::move(extent), partial_off, partial_len, p_src);
+ }
+ }
+
/**
* do_get_caching_extent
*
* returns ref to extent at offset~length of type T either from
* - extent_set if already in cache
* - disk
+   * only loads the byte range partial_off~partial_len from disk
*/
using src_ext_t = std::pair<Transaction::src_t, extent_types_t>;
template <typename T, typename Func, typename OnCache>
read_extent_ret<T> do_get_caching_extent(
paddr_t offset, ///< [in] starting addr
extent_len_t length, ///< [in] length
+ extent_len_t partial_off, ///< [in] offset of piece in extent
+ extent_len_t partial_len, ///< [in] length of piece in extent
Func &&extent_init_func, ///< [in] init func for extent
- OnCache &&on_cache
+ OnCache &&on_cache,
+ const Transaction::src_t* p_src
) {
LOG_PREFIX(Cache::do_get_caching_extent);
auto cached = query_cache(offset);
if (!cached) {
- auto ret = CachedExtent::make_cached_extent_ref<T>(
- alloc_cache_buf(length));
+ // partial read
+ TCachedExtentRef<T> ret = CachedExtent::make_cached_extent_ref<T>(length);
ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
offset,
PLACEMENT_HINT_NULL,
NULL_GENERATION,
TRANS_ID_NULL);
SUBDEBUG(seastore_cache,
- "{} {}~{} is absent, add extent and reading ... -- {}",
- T::TYPE, offset, length, *ret);
+ "{} {}~0x{:x} is absent, add extent and reading range 0x{:x}~0x{:x} ... -- {}",
+ T::TYPE, offset, length, partial_off, partial_len, *ret);
add_extent(ret);
// touch_extent() should be included in on_cache
on_cache(*ret);
extent_init_func(*ret);
return read_extent<T>(
- std::move(ret));
+ std::move(ret), partial_off, partial_len, p_src);
}
// extent PRESENT in cache
if (is_retired_placeholder_type(cached->get_type())) {
- auto ret = CachedExtent::make_cached_extent_ref<T>(
- alloc_cache_buf(length));
+ // partial read
+ TCachedExtentRef<T> ret = CachedExtent::make_cached_extent_ref<T>(length);
ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
offset,
PLACEMENT_HINT_NULL,
NULL_GENERATION,
TRANS_ID_NULL);
SUBDEBUG(seastore_cache,
- "{} {}~{} is absent(placeholder), reading ... -- {}",
- T::TYPE, offset, length, *ret);
+ "{} {}~0x{:x} is absent(placeholder), add extent and reading range 0x{:x}~0x{:x} ... -- {}",
+ T::TYPE, offset, length, partial_off, partial_len, *ret);
extents_index.replace(*ret, *cached);
on_cache(*ret);
@@ -688,34 +716,41 @@ private:
cached->state = CachedExtent::extent_state_t::INVALID;
extent_init_func(*ret);
return read_extent<T>(
- std::move(ret));
- } else if (!cached->is_fully_loaded()) {
- auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
- on_cache(*ret);
- SUBDEBUG(seastore_cache,
- "{} {}~{} is present without been fully loaded, reading ... -- {}",
- T::TYPE, offset, length, *ret);
- auto bp = alloc_cache_buf(length);
- ret->set_bptr(std::move(bp));
- return read_extent<T>(
- std::move(ret));
- } else {
+ std::move(ret), partial_off, partial_len, p_src);
+ }
+
+ auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
+ on_cache(*ret);
+ if (ret->is_range_loaded(partial_off, partial_len)) {
SUBTRACE(seastore_cache,
- "{} {}~{} is present in cache -- {}",
- T::TYPE, offset, length, *cached);
- auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
- on_cache(*ret);
- return ret->wait_io(
- ).then([ret=std::move(ret)]() mutable
- -> read_extent_ret<T> {
+ "{} {}~0x{:x} is present with range 0x{:x}~0x{:x} ... -- {}",
+ T::TYPE, offset, length, partial_off, partial_len, *ret);
+ return ret->wait_io().then([ret] {
// ret may be invalid, caller must check
- return read_extent_ret<T>(
- get_extent_ertr::ready_future_marker{},
- std::move(ret));
+ return seastar::make_ready_future<TCachedExtentRef<T>>(ret);
});
+ } else {
+ SUBDEBUG(seastore_cache,
+ "{} {}~0x{:x} is present without range 0x{:x}~0x{:x}, reading ... -- {}",
+ T::TYPE, offset, length, partial_off, partial_len, *ret);
+ return do_read_extent_maybe_partial(
+ std::move(ret), partial_off, partial_len, p_src);
}
}
+ template <typename T, typename Func, typename OnCache>
+ read_extent_ret<T> do_get_caching_extent(
+ paddr_t offset, ///< [in] starting addr
+ extent_len_t length, ///< [in] length
+ Func &&extent_init_func, ///< [in] init func for extent
+ OnCache &&on_cache,
+ const Transaction::src_t* p_src
+ ) {
+ return do_get_caching_extent<T>(offset, length, 0, length,
+ std::forward<Func>(extent_init_func),
+ std::forward<OnCache>(on_cache),
+ p_src);
+ }
// This is a workaround std::move_only_function not being available,
// not really worth generalizing at this time.
@@ -751,8 +786,8 @@ private:
laddr_t laddr,
extent_len_t length,
extent_init_func_t &&extent_init_func,
- extent_init_func_t &&on_cache
- );
+ extent_init_func_t &&on_cache,
+ const Transaction::src_t* p_src);
/**
* get_caching_extent_by_type
@@ -774,40 +809,39 @@ private:
extent_init_func_t &&extent_init_func
) {
LOG_PREFIX(Cache::get_caching_extent_by_type);
+ const auto t_src = t.get_src();
CachedExtentRef ret;
auto status = t.get_extent(offset, &ret);
if (status == Transaction::get_extent_ret::RETIRED) {
- SUBERRORT(seastore_cache, "{} {}~{} {} is retired on t -- {}",
+ SUBERRORT(seastore_cache, "{} {}~0x{:x} {} is retired on t -- {}",
t, type, offset, length, laddr, *ret);
ceph_abort("impossible");
} else if (status == Transaction::get_extent_ret::PRESENT) {
+ assert(ret->get_length() == length);
if (ret->is_fully_loaded()) {
- SUBTRACET(seastore_cache, "{} {}~{} {} is present on t -- {}",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is present on t -- {}",
t, type, offset, length, laddr, *ret);
return ret->wait_io().then([ret] {
return seastar::make_ready_future<CachedExtentRef>(ret);
});
} else {
- assert(!ret->is_mutable());
- SUBDEBUGT(seastore_cache, "{} {}~{} {} is present on t without been \
- fully loaded, reading ...", t, type, offset, length, laddr);
- auto bp = alloc_cache_buf(ret->get_length());
- ret->set_bptr(std::move(bp));
- return read_extent<CachedExtent>(
- std::move(ret));
+ SUBDEBUGT(seastore_cache,
+ "{} {}~0x{:x} {} is present on t without fully loaded, reading ... -- {}",
+ t, type, offset, length, laddr, *ret);
+ return do_read_extent_maybe_partial<CachedExtent>(
+ std::move(ret), 0, length, &t_src);
}
} else {
- SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is absent on t, query cache ...",
t, type, offset, length, laddr);
- auto f = [&t, this](CachedExtent &ext) {
+ auto f = [&t, this, t_src](CachedExtent &ext) {
t.add_to_read_set(CachedExtentRef(&ext));
- const auto t_src = t.get_src();
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent_by_type(
type, offset, laddr, length,
- std::move(extent_init_func), std::move(f))
+ std::move(extent_init_func), std::move(f), &t_src)
);
}
}
@@ -831,12 +865,12 @@ private:
}
#endif
- SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is absent on t, query cache ...",
t, type, offset, length, laddr);
- auto f = [&t, this](CachedExtent &ext) {
+ const auto t_src = t.get_src();
+ auto f = [&t, this, t_src](CachedExtent &ext) {
// FIXME: assert(ext.is_stable_clean());
assert(ext.is_stable());
- const auto t_src = t.get_src();
extent_access_stats_t& access_stats = get_by_ext(
get_by_src(stats.access_by_src_ext, t_src),
ext.get_type());
@@ -844,12 +878,12 @@ private:
++stats.access.s.load_absent;
t.add_to_read_set(CachedExtentRef(&ext));
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent_by_type(
type, offset, laddr, length,
- std::move(extent_init_func), std::move(f))
+ std::move(extent_init_func), std::move(f), &t_src)
);
}
@@ -871,7 +905,7 @@ private:
for (auto it = start_iter;
it != end_iter;
it++) {
- res.emplace(it->paddr, it->laddr, it->len, it->type, it->seq);
+ res.emplace(it->paddr, it->laddr, it->len, it->type);
}
return res;
}
@@ -970,7 +1004,7 @@ public:
#endif
) {
LOG_PREFIX(Cache::alloc_new_non_data_extent);
- SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ SUBTRACET(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}",
t, T::TYPE, length, hint, rewrite_gen_printer_t{gen});
#ifdef UNIT_TESTS_BUILT
auto result = epm.alloc_new_non_data_extent(t, T::TYPE, length, hint, gen, epaddr);
@@ -978,7 +1012,8 @@ public:
auto result = epm.alloc_new_non_data_extent(t, T::TYPE, length, hint, gen);
#endif
if (!result) {
- return nullptr;
+ SUBERRORT(seastore_cache, "insufficient space", t);
+ std::rethrow_exception(crimson::ct_error::enospc::exception_ptr());
}
auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result->bp));
ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING,
@@ -988,7 +1023,7 @@ public:
t.get_trans_id());
t.add_fresh_extent(ret);
SUBDEBUGT(seastore_cache,
- "allocated {} {}B extent at {}, hint={}, gen={} -- {}",
+ "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}",
t, T::TYPE, length, result->paddr,
hint, rewrite_gen_printer_t{result->gen}, *ret);
return ret;
@@ -1012,13 +1047,17 @@ public:
#endif
) {
LOG_PREFIX(Cache::alloc_new_data_extents);
- SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ SUBTRACET(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}",
t, T::TYPE, length, hint, rewrite_gen_printer_t{gen});
#ifdef UNIT_TESTS_BUILT
auto results = epm.alloc_new_data_extents(t, T::TYPE, length, hint, gen, epaddr);
#else
auto results = epm.alloc_new_data_extents(t, T::TYPE, length, hint, gen);
#endif
+ if (results.empty()) {
+ SUBERRORT(seastore_cache, "insufficient space", t);
+ std::rethrow_exception(crimson::ct_error::enospc::exception_ptr());
+ }
std::vector<TCachedExtentRef<T>> extents;
for (auto &result : results) {
auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp));
@@ -1029,7 +1068,7 @@ public:
t.get_trans_id());
t.add_fresh_extent(ret);
SUBDEBUGT(seastore_cache,
- "allocated {} {}B extent at {}, hint={}, gen={} -- {}",
+ "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}",
t, T::TYPE, length, result.paddr,
hint, rewrite_gen_printer_t{result.gen}, *ret);
extents.emplace_back(std::move(ret));
@@ -1063,7 +1102,7 @@ public:
// (relative/temp) paddr, so make extent directly
ext = CachedExtent::make_cached_extent_ref<T>(std::move(nbp));
} else {
- ext = CachedExtent::make_placeholder_cached_extent_ref<T>(remap_length);
+ ext = CachedExtent::make_cached_extent_ref<T>(remap_length);
}
ext->init(CachedExtent::extent_state_t::EXIST_CLEAN,
@@ -1075,7 +1114,7 @@ public:
auto extent = ext->template cast<T>();
extent->set_laddr(remap_laddr);
t.add_fresh_extent(ext);
- SUBTRACET(seastore_cache, "allocated {} {}B, hint={}, has ptr? {} -- {}",
+ SUBTRACET(seastore_cache, "allocated {} 0x{:x}B, hint={}, has ptr? {} -- {}",
t, T::TYPE, remap_length, remap_laddr, original_bptr.has_value(), *extent);
return extent;
}
@@ -1218,7 +1257,7 @@ public:
{
LOG_PREFIX(Cache::init_cached_extents);
SUBINFOT(seastore_cache,
- "start with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
+ "start with {}(0x{:x}B) extents, {} dirty, dirty_from={}, alloc_from={}",
t,
extents_index.size(),
extents_index.get_bytes(),
@@ -1261,7 +1300,7 @@ public:
}
).si_then([this, FNAME, &t] {
SUBINFOT(seastore_cache,
- "finish with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
+ "finish with {}(0x{:x}B) extents, {} dirty, dirty_from={}, alloc_from={}",
t,
extents_index.size(),
extents_index.get_bytes(),
@@ -1435,11 +1474,10 @@ private:
/// Update lru for access to ref
void touch_extent(
CachedExtent &ext,
- const Transaction::src_t* p_src)
+ const Transaction::src_t* p_src,
+ cache_hint_t hint)
{
- if (p_src &&
- is_background_transaction(*p_src) &&
- is_logical_type(ext.get_type())) {
+ if (hint == CACHE_HINT_NOCACHE && is_logical_type(ext.get_type())) {
return;
}
if (ext.is_stable_clean() && !ext.is_placeholder()) {
@@ -1530,22 +1568,29 @@ private:
assert(extent.is_stable_clean() && !extent.is_placeholder());
assert(extent.primary_ref_list_hook.is_linked());
assert(lru.size() > 0);
- auto extent_length = extent.get_length();
- assert(current_size >= extent_length);
+ auto extent_loaded_length = extent.get_loaded_length();
+ assert(current_size >= extent_loaded_length);
lru.erase(lru.s_iterator_to(extent));
- current_size -= extent_length;
- get_by_ext(sizes_by_ext, extent.get_type()).account_out(extent_length);
- overall_io.out_sizes.account_in(extent_length);
+ current_size -= extent_loaded_length;
+ get_by_ext(sizes_by_ext, extent.get_type()).account_out(extent_loaded_length);
+ overall_io.out_sizes.account_in(extent_loaded_length);
if (p_src) {
get_by_ext(
get_by_src(trans_io_by_src_ext, *p_src),
extent.get_type()
- ).out_sizes.account_in(extent_length);
+ ).out_sizes.account_in(extent_loaded_length);
}
intrusive_ptr_release(&extent);
}
+ void trim_to_capacity(
+ const Transaction::src_t* p_src) {
+ while (current_size > capacity) {
+ do_remove_from_lru(lru.front(), p_src);
+ }
+ }
+
public:
LRU(size_t capacity) : capacity(capacity) {}
@@ -1579,31 +1624,55 @@ private:
const Transaction::src_t* p_src) {
assert(extent.is_stable_clean() && !extent.is_placeholder());
- auto extent_length = extent.get_length();
+ auto extent_loaded_length = extent.get_loaded_length();
if (extent.primary_ref_list_hook.is_linked()) {
// present, move to top (back)
assert(lru.size() > 0);
- assert(current_size >= extent_length);
+ assert(current_size >= extent_loaded_length);
lru.erase(lru.s_iterator_to(extent));
lru.push_back(extent);
} else {
// absent, add to top (back)
- current_size += extent_length;
- get_by_ext(sizes_by_ext, extent.get_type()).account_in(extent_length);
- overall_io.in_sizes.account_in(extent_length);
+ if (extent_loaded_length > 0) {
+ current_size += extent_loaded_length;
+ get_by_ext(sizes_by_ext, extent.get_type()).account_in(extent_loaded_length);
+ overall_io.in_sizes.account_in(extent_loaded_length);
+ if (p_src) {
+ get_by_ext(
+ get_by_src(trans_io_by_src_ext, *p_src),
+ extent.get_type()
+ ).in_sizes.account_in(extent_loaded_length);
+ }
+ } // else: the extent isn't loaded upon touch_extent()/on_cache(),
+ // account the io later in increase_cached_size() upon read_extent()
+ intrusive_ptr_add_ref(&extent);
+ lru.push_back(extent);
+
+ trim_to_capacity(p_src);
+ }
+ }
+
+ void increase_cached_size(
+ CachedExtent &extent,
+ extent_len_t increased_length,
+ const Transaction::src_t* p_src) {
+ assert(!extent.is_mutable());
+
+ if (extent.primary_ref_list_hook.is_linked()) {
+ assert(extent.is_stable_clean() && !extent.is_placeholder());
+ // present, increase size
+ assert(lru.size() > 0);
+ current_size += increased_length;
+ get_by_ext(sizes_by_ext, extent.get_type()).account_in(increased_length);
+ overall_io.in_sizes.account_in(increased_length);
if (p_src) {
get_by_ext(
get_by_src(trans_io_by_src_ext, *p_src),
extent.get_type()
- ).in_sizes.account_in(extent_length);
+ ).in_sizes.account_in(increased_length);
}
- intrusive_ptr_add_ref(&extent);
- lru.push_back(extent);
- // trim to capacity
- while (current_size > capacity) {
- do_remove_from_lru(lru.front(), p_src);
- }
+ trim_to_capacity(nullptr);
}
}
@@ -1758,18 +1827,23 @@ private:
seastar::metrics::metric_group metrics;
void register_metrics();
- /// alloc buffer for cached extent
- bufferptr alloc_cache_buf(size_t size) {
- // TODO: memory pooling etc
- auto bp = ceph::bufferptr(
- buffer::create_page_aligned(size));
- bp.zero();
- return bp;
+ void apply_backref_mset(
+ backref_entry_refs_t& backref_entries) {
+ for (auto& entry : backref_entries) {
+ backref_entry_mset.insert(*entry);
+ }
}
- void backref_batch_update(
- std::vector<backref_entry_ref> &&,
- const journal_seq_t &);
+ void apply_backref_byseq(
+ backref_entry_refs_t&& backref_entries,
+ const journal_seq_t& seq);
+
+ void commit_backref_entries(
+ backref_entry_refs_t&& backref_entries,
+ const journal_seq_t& seq) {
+ apply_backref_mset(backref_entries);
+ apply_backref_byseq(std::move(backref_entries), seq);
+ }
/// Add extent to extents handling dirty and refcounting
///
@@ -1819,39 +1893,74 @@ private:
/// Introspect transaction when it is being destructed
void on_transaction_destruct(Transaction& t);
+ /// Read the extent in range offset~length,
+ /// must be called exclusively for an extent,
+ /// also see do_read_extent_maybe_partial().
+ ///
+ /// May return an invalid extent due to transaction conflict.
template <typename T>
read_extent_ret<T> read_extent(
- TCachedExtentRef<T>&& extent
+ TCachedExtentRef<T>&& extent,
+ extent_len_t offset,
+ extent_len_t length,
+ const Transaction::src_t* p_src
) {
+ LOG_PREFIX(Cache::read_extent);
assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING ||
- extent->state == CachedExtent::extent_state_t::EXIST_CLEAN ||
- extent->state == CachedExtent::extent_state_t::CLEAN);
+ extent->state == CachedExtent::extent_state_t::EXIST_CLEAN ||
+ extent->state == CachedExtent::extent_state_t::CLEAN);
+ assert(!extent->is_range_loaded(offset, length));
+ assert(is_aligned(offset, get_block_size()));
+ assert(is_aligned(length, get_block_size()));
extent->set_io_wait();
- return epm.read(
- extent->get_paddr(),
- extent->get_length(),
- extent->get_bptr()
- ).safe_then(
- [extent=std::move(extent), this]() mutable {
- LOG_PREFIX(Cache::read_extent);
- if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) {
- extent->state = CachedExtent::extent_state_t::CLEAN;
- }
- ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN
- || extent->state == CachedExtent::extent_state_t::CLEAN
- || !extent->is_valid());
- if (extent->is_valid()) {
- // crc will be checked against LBA leaf entry for logical extents,
- // or check against in-extent crc for physical extents.
- if (epm.get_checksum_needed(extent->get_paddr())) {
- extent->last_committed_crc = extent->calc_crc32c();
- } else {
- extent->last_committed_crc = CRC_NULL;
- }
- extent->on_clean_read();
- }
+ auto old_length = extent->get_loaded_length();
+ load_ranges_t to_read = extent->load_ranges(offset, length);
+ auto new_length = extent->get_loaded_length();
+ assert(new_length > old_length);
+ lru.increase_cached_size(*extent, new_length - old_length, p_src);
+ return seastar::do_with(to_read.ranges, [extent, this, FNAME](auto &read_ranges) {
+ return ExtentPlacementManager::read_ertr::parallel_for_each(
+ read_ranges, [extent, this, FNAME](auto &read_range) {
+ SUBDEBUG(seastore_cache, "reading extent {} 0x{:x}~0x{:x} ...",
+ extent->get_paddr(), read_range.offset, read_range.get_length());
+ assert(is_aligned(read_range.offset, get_block_size()));
+ assert(is_aligned(read_range.get_length(), get_block_size()));
+ return epm.read(
+ extent->get_paddr() + read_range.offset,
+ read_range.get_length(),
+ read_range.ptr);
+ });
+ }).safe_then(
+ [this, FNAME, extent=std::move(extent), offset, length]() mutable {
+ if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) {
+ extent->state = CachedExtent::extent_state_t::CLEAN;
+ }
+ ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN
+ || extent->state == CachedExtent::extent_state_t::CLEAN
+ || !extent->is_valid());
+ if (extent->is_valid()) {
+ if (extent->is_fully_loaded()) {
+ // crc will be checked against LBA leaf entry for logical extents,
+ // or check against in-extent crc for physical extents.
+ if (epm.get_checksum_needed(extent->get_paddr())) {
+ extent->last_committed_crc = extent->calc_crc32c();
+ } else {
+ extent->last_committed_crc = CRC_NULL;
+ }
+ // on_clean_read() may change the content, call after calc_crc32c()
+ extent->on_clean_read();
+ SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done -- {}",
+ offset, length, *extent);
+ } else {
+ extent->last_committed_crc = CRC_NULL;
+ SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (partial) -- {}",
+ offset, length, *extent);
+ }
+ } else {
+ SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (invalidated) -- {}",
+ offset, length, *extent);
+ }
extent->complete_io();
- SUBDEBUG(seastore_cache, "read extent done -- {}", *extent);
return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
std::move(extent));
},
diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc
index 76c18bde667..49fede1d9a8 100644
--- a/src/crimson/os/seastore/cached_extent.cc
+++ b/src/crimson/os/seastore/cached_extent.cc
@@ -7,6 +7,7 @@
#include "crimson/common/log.h"
#include "crimson/os/seastore/btree/fixed_kv_node.h"
+#include "crimson/os/seastore/lba_mapping.h"
namespace {
[[maybe_unused]] seastar::logger& logger() {
@@ -38,12 +39,6 @@ void intrusive_ptr_release(CachedExtent *ptr)
#endif
-bool is_backref_mapped_extent_node(const CachedExtentRef &extent) {
- return extent->is_logical()
- || is_lba_node(extent->get_type())
- || extent->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL;
-}
-
std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state)
{
switch (state) {
@@ -94,15 +89,15 @@ CachedExtent* CachedExtent::get_transactional_view(transaction_id_t tid) {
}
std::ostream &operator<<(std::ostream &out, const parent_tracker_t &tracker) {
- return out << "parent_tracker=" << (void*)&tracker
- << ", parent=" << (void*)tracker.get_parent().get();
+ return out << "tracker_ptr=" << (void*)&tracker
+ << ", parent_ptr=" << (void*)tracker.get_parent().get();
}
std::ostream &ChildableCachedExtent::print_detail(std::ostream &out) const {
if (parent_tracker) {
- out << *parent_tracker;
+ out << ", parent_tracker(" << *parent_tracker << ")";
} else {
- out << ", parent_tracker=" << (void*)nullptr;
+ out << ", parent_tracker(nullptr)";
}
_print_detail(out);
return out;
@@ -148,6 +143,12 @@ void LogicalCachedExtent::on_replace_prior() {
parent->children[off] = this;
}
+void LogicalCachedExtent::maybe_set_intermediate_laddr(LBAMapping &mapping) {
+ laddr = mapping.is_indirect()
+ ? mapping.get_intermediate_base()
+ : mapping.get_key();
+}
+
parent_tracker_t::~parent_tracker_t() {
// this is parent's tracker, reset it
auto &p = (FixedKVNode<laddr_t>&)*parent;
@@ -156,30 +157,183 @@ parent_tracker_t::~parent_tracker_t() {
}
}
-std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs)
+bool BufferSpace::is_range_loaded(extent_len_t offset, extent_len_t length) const
{
- out << "LBAMapping(" << rhs.get_key()
- << "~0x" << std::hex << rhs.get_length() << std::dec
- << "->" << rhs.get_val();
- if (rhs.is_indirect()) {
- out << ",indirect(" << rhs.get_intermediate_base()
- << "~0x" << std::hex << rhs.get_intermediate_length()
- << "@0x" << rhs.get_intermediate_offset() << std::dec
- << ")";
+ assert(length > 0);
+ auto i = buffer_map.upper_bound(offset);
+ if (i == buffer_map.begin()) {
+ return false;
}
- out << ")";
- return out;
+ --i;
+ auto& [i_offset, i_bl] = *i;
+ assert(offset >= i_offset);
+ assert(i_bl.length() > 0);
+ if (offset + length > i_offset + i_bl.length()) {
+ return false;
+ } else {
+ return true;
+ }
+}
+
+ceph::bufferlist BufferSpace::get_buffer(extent_len_t offset, extent_len_t length) const
+{
+ assert(length > 0);
+ auto i = buffer_map.upper_bound(offset);
+ assert(i != buffer_map.begin());
+ --i;
+ auto& [i_offset, i_bl] = *i;
+ assert(offset >= i_offset);
+ assert(i_bl.length() > 0);
+ assert(offset + length <= i_offset + i_bl.length());
+ ceph::bufferlist res;
+ res.substr_of(i_bl, offset - i_offset, length);
+ return res;
+}
+
+load_ranges_t BufferSpace::load_ranges(extent_len_t offset, extent_len_t length)
+{
+ assert(length > 0);
+ load_ranges_t ret;
+ auto next = buffer_map.upper_bound(offset);
+
+ // must be assigned for the main-loop
+ map_t::iterator previous;
+ extent_len_t range_offset;
+ extent_len_t range_length;
+
+ // returns whether to proceed main-loop or not
+ auto f_merge_next_check_hole = [this, &next, &range_offset, &range_length](
+ ceph::bufferlist& previous_bl,
+ extent_len_t hole_length,
+ extent_len_t next_offset,
+ const ceph::bufferlist& next_bl) {
+ range_length -= hole_length;
+ previous_bl.append(next_bl);
+ if (range_length <= next_bl.length()) {
+ // "next" end includes or goes beyond the range
+ buffer_map.erase(next);
+ return false;
+ } else {
+ range_offset = next_offset + next_bl.length();
+ range_length -= next_bl.length();
+ // erase next should destruct next_bl
+ next = buffer_map.erase(next);
+ return true;
+ }
+ };
+
+ // returns whether to proceed main-loop or not
+ auto f_prepare_without_merge_previous = [
+ this, offset, length,
+ &ret, &previous, &next, &range_length,
+ &f_merge_next_check_hole]() {
+ if (next == buffer_map.end()) {
+ // "next" reaches end,
+ // range has no "next" to merge
+ create_hole_insert_map(ret, offset, length, next);
+ return false;
+ }
+ // "next" is valid
+ auto& [n_offset, n_bl] = *next;
+ // next is from upper_bound()
+ assert(offset < n_offset);
+ extent_len_t hole_length = n_offset - offset;
+ if (length < hole_length) {
+ // "next" is beyond the range end,
+ // range has no "next" to merge
+ create_hole_insert_map(ret, offset, length, next);
+ return false;
+ }
+ // length >= hole_length
+ // insert hole as "previous"
+ previous = create_hole_insert_map(ret, offset, hole_length, next);
+ auto& p_bl = previous->second;
+ range_length = length;
+ return f_merge_next_check_hole(p_bl, hole_length, n_offset, n_bl);
+ };
+
+ /*
+ * prepare main-loop
+ */
+ if (next == buffer_map.begin()) {
+ // "previous" is invalid
+ if (!f_prepare_without_merge_previous()) {
+ return ret;
+ }
+ } else {
+ // "previous" is valid
+ previous = std::prev(next);
+ auto& [p_offset, p_bl] = *previous;
+ assert(offset >= p_offset);
+ extent_len_t p_end = p_offset + p_bl.length();
+ if (offset <= p_end) {
+ // "previous" is adjacent or overlaps the range
+ range_offset = p_end;
+ assert(offset + length > p_end);
+ range_length = offset + length - p_end;
+ // start the main-loop (merge "previous")
+ } else {
+ // "previous" is not adjacent to the range
+ // range and buffer_map should not overlap
+ assert(offset > p_end);
+ if (!f_prepare_without_merge_previous()) {
+ return ret;
+ }
+ }
+ }
+
+ /*
+ * main-loop: merge the range with "previous" and look at "next"
+ *
+ * "previous": the previous buffer_map entry, must be valid, must be mergeable
+ * "next": the next buffer_map entry, maybe end, maybe mergeable
+ * range_offset/length: the current range right after "previous"
+ */
+ assert(std::next(previous) == next);
+ auto& [p_offset, p_bl] = *previous;
+ assert(range_offset == p_offset + p_bl.length());
+ assert(range_length > 0);
+ while (next != buffer_map.end()) {
+ auto& [n_offset, n_bl] = *next;
+ assert(range_offset < n_offset);
+ extent_len_t hole_length = n_offset - range_offset;
+ if (range_length < hole_length) {
+ // "next" offset is beyond the range end
+ break;
+ }
+ // range_length >= hole_length
+ create_hole_append_bl(ret, p_bl, range_offset, hole_length);
+ if (!f_merge_next_check_hole(p_bl, hole_length, n_offset, n_bl)) {
+ return ret;
+ }
+ assert(std::next(previous) == next);
+ assert(range_offset == p_offset + p_bl.length());
+ assert(range_length > 0);
+ }
+ // range has no "next" to merge:
+ // 1. "next" reaches end
+ // 2. "next" offset is beyond the range end
+ create_hole_append_bl(ret, p_bl, range_offset, range_length);
+ return ret;
}
-std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs)
+ceph::bufferptr BufferSpace::to_full_ptr(extent_len_t length)
{
- bool first = true;
- out << '[';
- for (const auto &i: rhs) {
- out << (first ? "" : ",") << *i;
- first = false;
+ assert(length > 0);
+ assert(buffer_map.size() == 1);
+ auto it = buffer_map.begin();
+ auto& [i_off, i_buf] = *it;
+ assert(i_off == 0);
+ if (!i_buf.is_contiguous()) {
+ // Allocate page aligned ptr, also see create_extent_ptr_*()
+ i_buf.rebuild();
}
- return out << ']';
+ assert(i_buf.get_num_buffers() == 1);
+ ceph::bufferptr ptr(i_buf.front());
+ assert(ptr.is_page_aligned());
+ assert(ptr.length() == length);
+ buffer_map.clear();
+ return ptr;
}
}
diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h
index 6025725aa33..9dc60d719eb 100644
--- a/src/crimson/os/seastore/cached_extent.h
+++ b/src/crimson/os/seastore/cached_extent.h
@@ -6,15 +6,15 @@
#include <iostream>
#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/set.hpp>
#include <boost/intrusive_ptr.hpp>
#include <boost/smart_ptr/intrusive_ref_counter.hpp>
#include "seastar/core/shared_future.hh"
#include "include/buffer.h"
-#include "crimson/common/errorator.h"
-#include "crimson/common/interruptible_future.h"
#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_interruptor.h"
struct btree_lba_manager_test;
struct lba_btree_test;
@@ -23,7 +23,6 @@ struct cache_test_t;
namespace crimson::os::seastore {
-class Transaction;
class CachedExtent;
using CachedExtentRef = boost::intrusive_ptr<CachedExtent>;
class SegmentedAllocator;
@@ -41,6 +40,20 @@ void intrusive_ptr_release(CachedExtent *);
#endif
+// Note: BufferSpace::to_full_ptr() also creates extent ptr.
+
+inline ceph::bufferptr create_extent_ptr_rand(extent_len_t len) {
+ assert(is_aligned(len, CEPH_PAGE_SIZE));
+ assert(len > 0);
+ return ceph::bufferptr(buffer::create_page_aligned(len));
+}
+
+inline ceph::bufferptr create_extent_ptr_zero(extent_len_t len) {
+ auto bp = create_extent_ptr_rand(len);
+ bp.zero();
+ return bp;
+}
+
template <typename T>
using TCachedExtentRef = boost::intrusive_ptr<T>;
@@ -155,6 +168,85 @@ struct trans_spec_view_t {
boost::intrusive::compare<cmp_t>>;
};
+struct load_range_t {
+ extent_len_t offset;
+ ceph::bufferptr ptr;
+
+ extent_len_t get_length() const {
+ return ptr.length();
+ }
+
+ extent_len_t get_end() const {
+ extent_len_t end = offset + ptr.length();
+ assert(end > offset);
+ return end;
+ }
+};
+struct load_ranges_t {
+ extent_len_t length = 0;
+ std::list<load_range_t> ranges;
+
+ void push_back(extent_len_t offset, ceph::bufferptr ptr) {
+ assert(ranges.empty() ||
+ (ranges.back().get_end() < offset));
+ assert(ptr.length());
+ length += ptr.length();
+ ranges.push_back({offset, std::move(ptr)});
+ }
+};
+
+/// manage small chunks of extent
+class BufferSpace {
+ using map_t = std::map<extent_len_t, ceph::bufferlist>;
+public:
+ BufferSpace() = default;
+
+ /// Returns true if offset~length is fully loaded
+ bool is_range_loaded(extent_len_t offset, extent_len_t length) const;
+
+ /// Returns the bufferlist of offset~length
+ ceph::bufferlist get_buffer(extent_len_t offset, extent_len_t length) const;
+
+ /// Returns the ranges to load, merge the buffer_map if possible
+ load_ranges_t load_ranges(extent_len_t offset, extent_len_t length);
+
+ /// Converts to ptr when fully loaded
+ ceph::bufferptr to_full_ptr(extent_len_t length);
+
+private:
+ // create and append the read-hole to
+ // load_ranges_t and bl
+ static void create_hole_append_bl(
+ load_ranges_t& ret,
+ ceph::bufferlist& bl,
+ extent_len_t hole_offset,
+ extent_len_t hole_length) {
+ ceph::bufferptr hole_ptr = create_extent_ptr_rand(hole_length);
+ bl.append(hole_ptr);
+ ret.push_back(hole_offset, std::move(hole_ptr));
+ }
+
+ // create and insert the read-hole to buffer_map,
+ // and append to load_ranges_t
+ // returns the iterator containing the inserted read-hole
+ auto create_hole_insert_map(
+ load_ranges_t& ret,
+ extent_len_t hole_offset,
+ extent_len_t hole_length,
+ const map_t::const_iterator& next_it) {
+ assert(!buffer_map.contains(hole_offset));
+ ceph::bufferlist bl;
+ create_hole_append_bl(ret, bl, hole_offset, hole_length);
+ auto it = buffer_map.insert(
+ next_it, std::pair{hole_offset, std::move(bl)});
+ assert(next_it == std::next(it));
+ return it;
+ }
+
+ /// extent offset -> buffer, won't overlap nor contiguous
+ map_t buffer_map;
+};
+
class ExtentIndex;
class CachedExtent
: public boost::intrusive_ref_counter<
@@ -256,6 +348,17 @@ public:
virtual void on_initial_write() {}
/**
+ * on_fully_loaded
+ *
+ * Called when ptr is ready. Normally this should be used to initialize
+ * the extent to be identical to CachedExtent(ptr).
+ *
+ * Note this doesn't mean the content is fully read, use on_clean_read for
+ * this purpose.
+ */
+ virtual void on_fully_loaded() {}
+
+ /**
* on_clean_read
*
* Called after read of initially written extent.
@@ -350,12 +453,12 @@ public:
<< ", modify_time=" << sea_time_point_printer_t{modify_time}
<< ", paddr=" << get_paddr()
<< ", prior_paddr=" << prior_poffset_str
- << std::hex << ", length=0x" << get_length() << std::dec
+ << std::hex << ", length=0x" << get_length()
+ << ", loaded=0x" << get_loaded_length() << std::dec
<< ", state=" << state
<< ", last_committed_crc=" << last_committed_crc
<< ", refcount=" << use_count()
<< ", user_hint=" << user_hint
- << ", fully_loaded=" << is_fully_loaded()
<< ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation};
if (state != extent_state_t::INVALID &&
state != extent_state_t::CLEAN_PENDING) {
@@ -537,7 +640,40 @@ public:
/// Return true if extent is fully loaded or is about to be fully loaded (call
/// wait_io() in this case)
bool is_fully_loaded() const {
- return ptr.has_value();
+ if (ptr.has_value()) {
+ // length == 0 iff root
+ assert(length == loaded_length);
+ assert(!buffer_space.has_value());
+ return true;
+ } else { // ptr is std::nullopt
+ assert(length > loaded_length);
+ assert(buffer_space.has_value());
+ return false;
+ }
+ }
+
+ /// Return true if range offset~_length is loaded
+ bool is_range_loaded(extent_len_t offset, extent_len_t _length) {
+ assert(is_aligned(offset, CEPH_PAGE_SIZE));
+ assert(is_aligned(_length, CEPH_PAGE_SIZE));
+ assert(_length > 0);
+ assert(offset + _length <= length);
+ if (is_fully_loaded()) {
+ return true;
+ }
+ return buffer_space->is_range_loaded(offset, _length);
+ }
+
+ /// Get buffer by given offset and _length.
+ ceph::bufferlist get_range(extent_len_t offset, extent_len_t _length) {
+ assert(is_range_loaded(offset, _length));
+ ceph::bufferlist res;
+ if (is_fully_loaded()) {
+ res.append(ceph::bufferptr(get_bptr(), offset, _length));
+ } else {
+ res = buffer_space->get_buffer(offset, _length);
+ }
+ return res;
}
/**
@@ -553,12 +689,9 @@ public:
return length;
}
+ /// Returns length of partially loaded extent data in cache
extent_len_t get_loaded_length() const {
- if (ptr.has_value()) {
- return ptr->length();
- } else {
- return 0;
- }
+ return loaded_length;
}
/// Returns version, get_version() == 0 iff is_clean()
@@ -697,12 +830,19 @@ private:
*/
journal_seq_t dirty_from_or_retired_at;
- /// cache data contents, std::nullopt if no data in cache
+ /// cache data contents, std::nullopt iff partially loaded
std::optional<ceph::bufferptr> ptr;
- /// disk data length
+ /// disk data length, 0 iff root
extent_len_t length;
+ /// loaded data length, <length iff partially loaded
+ extent_len_t loaded_length;
+
+ /// manager of buffer pieces for ObjectDataBlock
+ /// valid iff partially loaded
+ std::optional<BufferSpace> buffer_space;
+
/// number of deltas since initial write
extent_version_t version = 0;
@@ -748,9 +888,29 @@ protected:
trans_view_set_t retired_transactions;
CachedExtent(CachedExtent &&other) = delete;
- CachedExtent(ceph::bufferptr &&_ptr) : ptr(std::move(_ptr)) {
- length = ptr->length();
+
+ /// construct a fully loaded CachedExtent
+ explicit CachedExtent(ceph::bufferptr &&_ptr)
+ : length(_ptr.length()),
+ loaded_length(_ptr.length()) {
+ ptr = std::move(_ptr);
+
+ assert(ptr->is_page_aligned());
+ assert(length > 0);
+ assert(is_fully_loaded());
+ // must call init() to fully initialize
+ }
+
+ /// construct a partially loaded CachedExtent
+ /// must be identical with CachedExtent(ptr) after on_fully_loaded()
+ explicit CachedExtent(extent_len_t _length)
+ : length(_length),
+ loaded_length(0),
+ buffer_space(std::in_place) {
+ assert(is_aligned(length, CEPH_PAGE_SIZE));
assert(length > 0);
+ assert(!is_fully_loaded());
+ // must call init() to fully initialize
}
/// construct new CachedExtent, will deep copy the buffer
@@ -758,16 +918,20 @@ protected:
: state(other.state),
dirty_from_or_retired_at(other.dirty_from_or_retired_at),
length(other.get_length()),
+ loaded_length(other.get_loaded_length()),
version(other.version),
poffset(other.poffset) {
- assert((length % CEPH_PAGE_SIZE) == 0);
- if (other.is_fully_loaded()) {
- ptr.emplace(buffer::create_page_aligned(length));
- other.ptr->copy_out(0, length, ptr->c_str());
- } else {
- // the extent must be fully loaded before CoW
- assert(length == 0); // in case of root
- }
+ // the extent must be fully loaded before CoW
+ assert(other.is_fully_loaded());
+ assert(is_aligned(length, CEPH_PAGE_SIZE));
+ if (length > 0) {
+ ptr = create_extent_ptr_rand(length);
+ other.ptr->copy_out(0, length, ptr->c_str());
+ } else { // length == 0, must be root
+ ptr = ceph::bufferptr(0);
+ }
+
+ assert(is_fully_loaded());
}
struct share_buffer_t {};
@@ -777,23 +941,35 @@ protected:
dirty_from_or_retired_at(other.dirty_from_or_retired_at),
ptr(other.ptr),
length(other.get_length()),
+ loaded_length(other.get_loaded_length()),
version(other.version),
- poffset(other.poffset) {}
+ poffset(other.poffset) {
+ // the extent must be fully loaded before CoW
+ assert(other.is_fully_loaded());
+ assert(is_aligned(length, CEPH_PAGE_SIZE));
+ assert(length > 0);
+ assert(is_fully_loaded());
+ }
// 0 length is only possible for the RootBlock
- struct zero_length_t {};
- CachedExtent(zero_length_t) : ptr(ceph::bufferptr(0)), length(0) {};
-
- struct retired_placeholder_t{};
- CachedExtent(retired_placeholder_t, extent_len_t _length)
- : state(extent_state_t::CLEAN),
- length(_length) {
- assert(length > 0);
+ struct root_construct_t {};
+ CachedExtent(root_construct_t)
+ : ptr(ceph::bufferptr(0)),
+ length(0),
+ loaded_length(0) {
+ assert(is_fully_loaded());
+ // must call init() to fully initialize
}
- /// no buffer extent, for lazy read
- CachedExtent(extent_len_t _length) : length(_length) {
- assert(length > 0);
+ struct retired_placeholder_construct_t {};
+ CachedExtent(retired_placeholder_construct_t, extent_len_t _length)
+ : state(extent_state_t::CLEAN),
+ length(_length),
+ loaded_length(0),
+ buffer_space(std::in_place) {
+ assert(!is_fully_loaded());
+ assert(is_aligned(length, CEPH_PAGE_SIZE));
+ // must call init() to fully initialize
}
friend class Cache;
@@ -804,9 +980,8 @@ protected:
}
template <typename T>
- static TCachedExtentRef<T> make_placeholder_cached_extent_ref(
- extent_len_t length) {
- return new T(length);
+ static TCachedExtentRef<T> make_cached_extent_ref() {
+ return new T();
}
void reset_prior_instance() {
@@ -869,6 +1044,45 @@ protected:
}
}
+ /// Returns the ranges to load, converts to fully loaded if possible
+ load_ranges_t load_ranges(extent_len_t offset, extent_len_t _length) {
+ assert(is_aligned(offset, CEPH_PAGE_SIZE));
+ assert(is_aligned(_length, CEPH_PAGE_SIZE));
+ assert(_length > 0);
+ assert(offset + _length <= length);
+ assert(!is_fully_loaded());
+
+ if (loaded_length == 0 && _length == length) {
+ assert(offset == 0);
+ // skip rebuilding the buffer from buffer_space
+ ptr = create_extent_ptr_rand(length);
+ loaded_length = _length;
+ buffer_space.reset();
+ assert(is_fully_loaded());
+ on_fully_loaded();
+ load_ranges_t ret;
+ ret.push_back(offset, *ptr);
+ return ret;
+ }
+
+ load_ranges_t ret = buffer_space->load_ranges(offset, _length);
+ loaded_length += ret.length;
+ assert(length >= loaded_length);
+ if (length == loaded_length) {
+ // convert to fully loaded
+ ptr = buffer_space->to_full_ptr(length);
+ buffer_space.reset();
+ assert(is_fully_loaded());
+ on_fully_loaded();
+ // adjust ret since the ptr has been rebuilt
+ for (load_range_t& range : ret.ranges) {
+ auto range_length = range.ptr.length();
+ range.ptr = ceph::bufferptr(*ptr, range.offset, range_length);
+ }
+ }
+ return ret;
+ }
+
friend class crimson::os::seastore::SegmentedAllocator;
friend class crimson::os::seastore::TransactionManager;
friend class crimson::os::seastore::ExtentPlacementManager;
@@ -883,8 +1097,6 @@ protected:
std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t);
std::ostream &operator<<(std::ostream &, const CachedExtent&);
-bool is_backref_mapped_extent_node(const CachedExtentRef &extent);
-
/// Compare extents by paddr
struct paddr_cmp {
bool operator()(paddr_t lhs, const CachedExtent &rhs) const {
@@ -1067,7 +1279,6 @@ private:
};
class ChildableCachedExtent;
-class LogicalCachedExtent;
class child_pos_t {
public:
@@ -1088,14 +1299,17 @@ private:
uint16_t pos = std::numeric_limits<uint16_t>::max();
};
-using get_child_ertr = crimson::errorator<
- crimson::ct_error::input_output_error>;
+using get_child_iertr = trans_iertr<crimson::errorator<
+ crimson::ct_error::input_output_error>>;
+template <typename T>
+using get_child_ifut = get_child_iertr::future<TCachedExtentRef<T>>;
+
template <typename T>
struct get_child_ret_t {
- std::variant<child_pos_t, get_child_ertr::future<TCachedExtentRef<T>>> ret;
+ std::variant<child_pos_t, get_child_ifut<T>> ret;
get_child_ret_t(child_pos_t pos)
: ret(std::move(pos)) {}
- get_child_ret_t(get_child_ertr::future<TCachedExtentRef<T>> child)
+ get_child_ret_t(get_child_ifut<T> child)
: ret(std::move(child)) {}
bool has_child() const {
@@ -1107,7 +1321,7 @@ struct get_child_ret_t {
return std::get<0>(ret);
}
- get_child_ertr::future<TCachedExtentRef<T>> &get_child_fut() {
+ get_child_ifut<T> &get_child_fut() {
ceph_assert(ret.index() == 1);
return std::get<1>(ret);
}
@@ -1122,48 +1336,18 @@ using PhysicalNodeMappingRef = std::unique_ptr<PhysicalNodeMapping<key_t, val_t>
template <typename key_t, typename val_t>
class PhysicalNodeMapping {
public:
+ PhysicalNodeMapping() = default;
+ PhysicalNodeMapping(const PhysicalNodeMapping&) = delete;
virtual extent_len_t get_length() const = 0;
- virtual extent_types_t get_type() const = 0;
virtual val_t get_val() const = 0;
virtual key_t get_key() const = 0;
- virtual PhysicalNodeMappingRef<key_t, val_t> duplicate() const = 0;
- virtual PhysicalNodeMappingRef<key_t, val_t> refresh_with_pending_parent() {
- ceph_abort("impossible");
- return {};
- }
virtual bool has_been_invalidated() const = 0;
virtual CachedExtentRef get_parent() const = 0;
virtual uint16_t get_pos() const = 0;
- // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h
- virtual bool is_indirect() const { return false; }
- virtual key_t get_intermediate_key() const { return min_max_t<key_t>::null; }
- virtual key_t get_intermediate_base() const { return min_max_t<key_t>::null; }
- virtual extent_len_t get_intermediate_length() const { return 0; }
virtual uint32_t get_checksum() const {
ceph_abort("impossible");
return 0;
}
- // The start offset of the pin, must be 0 if the pin is not indirect
- virtual extent_len_t get_intermediate_offset() const {
- return std::numeric_limits<extent_len_t>::max();
- }
-
- virtual get_child_ret_t<LogicalCachedExtent>
- get_logical_extent(Transaction &t) = 0;
-
- void link_child(ChildableCachedExtent *c) {
- ceph_assert(child_pos);
- child_pos->link_child(c);
- }
-
- // For reserved mappings, the return values are
- // undefined although it won't crash
- virtual bool is_stable() const = 0;
- virtual bool is_data_stable() const = 0;
- virtual bool is_clone() const = 0;
- bool is_zero_reserved() const {
- return !get_val().is_real();
- }
virtual bool is_parent_viewable() const = 0;
virtual bool is_parent_valid() const = 0;
virtual bool parent_modified() const {
@@ -1176,24 +1360,8 @@ public:
}
virtual ~PhysicalNodeMapping() {}
-protected:
- std::optional<child_pos_t> child_pos = std::nullopt;
};
-using LBAMapping = PhysicalNodeMapping<laddr_t, paddr_t>;
-using LBAMappingRef = PhysicalNodeMappingRef<laddr_t, paddr_t>;
-
-std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs);
-
-using lba_pin_list_t = std::list<LBAMappingRef>;
-
-std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs);
-
-using BackrefMapping = PhysicalNodeMapping<paddr_t, laddr_t>;
-using BackrefMappingRef = PhysicalNodeMappingRef<paddr_t, laddr_t>;
-
-using backref_pin_list_t = std::list<BackrefMappingRef>;
-
/**
* RetiredExtentPlaceholder
*
@@ -1209,7 +1377,7 @@ class RetiredExtentPlaceholder : public CachedExtent {
public:
RetiredExtentPlaceholder(extent_len_t length)
- : CachedExtent(CachedExtent::retired_placeholder_t{}, length) {}
+ : CachedExtent(CachedExtent::retired_placeholder_construct_t{}, length) {}
CachedExtentRef duplicate_for_write(Transaction&) final {
ceph_assert(0 == "Should never happen for a placeholder");
@@ -1307,6 +1475,8 @@ private:
return out;
}
};
+
+class LBAMapping;
/**
* LogicalCachedExtent
*
@@ -1341,11 +1511,7 @@ public:
laddr = nladdr;
}
- void maybe_set_intermediate_laddr(LBAMapping &mapping) {
- laddr = mapping.is_indirect()
- ? mapping.get_intermediate_base()
- : mapping.get_key();
- }
+ void maybe_set_intermediate_laddr(LBAMapping &mapping);
void apply_delta_and_adjust_crc(
paddr_t base, const ceph::bufferlist &bl) final {
@@ -1445,8 +1611,6 @@ using lextent_list_t = addr_extent_list_base_t<
}
#if FMT_VERSION >= 90000
-template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::CachedExtent> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::LogicalCachedExtent> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {};
#endif
diff --git a/src/crimson/os/seastore/collection_manager/collection_flat_node.h b/src/crimson/os/seastore/collection_manager/collection_flat_node.h
index aa1e7135613..1f4de652bba 100644
--- a/src/crimson/os/seastore/collection_manager/collection_flat_node.h
+++ b/src/crimson/os/seastore/collection_manager/collection_flat_node.h
@@ -96,6 +96,8 @@ struct CollectionNode
explicit CollectionNode(ceph::bufferptr &&ptr)
: LogicalCachedExtent(std::move(ptr)) {}
+ explicit CollectionNode(extent_len_t length)
+ : LogicalCachedExtent(length) {}
explicit CollectionNode(const CollectionNode &other)
: LogicalCachedExtent(other),
decoded(other.decoded) {}
diff --git a/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc
index c32dc66619a..866b5bf350c 100644
--- a/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc
+++ b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc
@@ -51,8 +51,11 @@ FlatCollectionManager::get_coll_root(const coll_root_t &coll_root, Transaction &
cc.t,
coll_root.get_location(),
coll_root.get_size()
- ).si_then([](auto&& e) {
- return get_root_iertr::make_ready_future<CollectionNodeRef>(std::move(e));
+ ).si_then([](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ return get_root_iertr::make_ready_future<CollectionNodeRef>(
+ std::move(maybe_indirect_extent.extent));
});
}
diff --git a/src/crimson/os/seastore/device.cc b/src/crimson/os/seastore/device.cc
index c3bda82a7f6..cc83eb54826 100644
--- a/src/crimson/os/seastore/device.cc
+++ b/src/crimson/os/seastore/device.cc
@@ -12,7 +12,7 @@ namespace crimson::os::seastore {
std::ostream& operator<<(std::ostream& out, const device_spec_t& ds)
{
return out << "device_spec("
- << "magic=" << ds.magic
+ << "magic=0x" << std::hex << ds.magic << std::dec
<< ", dtype=" << ds.dtype
<< ", " << device_id_printer_t{ds.id}
<< ")";
diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc
index 0458fbfed74..fd19eeb7e58 100644
--- a/src/crimson/os/seastore/extent_placement_manager.cc
+++ b/src/crimson/os/seastore/extent_placement_manager.cc
@@ -1069,8 +1069,8 @@ RandomBlockOolWriter::do_write(
w_info.bp = bp;
writes.push_back(w_info);
}
- TRACE("current extent: base off {} len {},\
- maybe-merged current extent: base off {} len {}",
+ TRACE("current extent: {}~0x{:x},\
+ maybe-merged current extent: {}~0x{:x}",
paddr, ex->get_length(), writes.back().offset, writes.back().bp.length());
}
diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h
index c4e98a5f4a1..4ff9729c5f4 100644
--- a/src/crimson/os/seastore/extent_placement_manager.h
+++ b/src/crimson/os/seastore/extent_placement_manager.h
@@ -236,9 +236,9 @@ struct io_usage_t {
cleaner_usage_t cleaner_usage;
friend std::ostream &operator<<(std::ostream &out, const io_usage_t &usage) {
return out << "io_usage_t("
- << "inline_usage=" << usage.inline_usage
- << ", main_cleaner_usage=" << usage.cleaner_usage.main_usage
- << ", cold_cleaner_usage=" << usage.cleaner_usage.cold_ool_usage
+ << "inline_usage=0x" << std::hex << usage.inline_usage
+ << ", main_cleaner_usage=0x" << usage.cleaner_usage.main_usage
+ << ", cold_cleaner_usage=0x" << usage.cleaner_usage.cold_ool_usage << std::dec
<< ")";
}
};
@@ -371,9 +371,7 @@ public:
// XXX: bp might be extended to point to different memory (e.g. PMem)
// according to the allocator.
- auto bp = ceph::bufferptr(
- buffer::create_page_aligned(length));
- bp.zero();
+ auto bp = create_extent_ptr_zero(length);
return alloc_result_t{addr, std::move(bp), gen};
}
@@ -405,9 +403,7 @@ public:
#ifdef UNIT_TESTS_BUILT
if (unlikely(external_paddr.has_value())) {
assert(external_paddr->is_fake());
- auto bp = ceph::bufferptr(
- buffer::create_page_aligned(length));
- bp.zero();
+ auto bp = create_extent_ptr_zero(length);
allocs.emplace_back(alloc_result_t{*external_paddr, std::move(bp), gen});
} else {
#else
@@ -418,15 +414,17 @@ public:
for (auto &ext : addrs) {
auto left = ext.len;
while (left > 0) {
- auto len = std::min(max_data_allocation_size, left);
- auto bp = ceph::bufferptr(buffer::create_page_aligned(len));
- bp.zero();
+ auto len = left;
+ if (max_data_allocation_size) {
+ len = std::min(max_data_allocation_size, len);
+ }
+ auto bp = create_extent_ptr_zero(len);
auto start = ext.start.is_delayed()
? ext.start
: ext.start + (ext.len - left);
allocs.emplace_back(alloc_result_t{start, std::move(bp), gen});
SUBDEBUGT(seastore_epm,
- "allocated {} {}B extent at {}, hint={}, gen={}",
+ "allocated {} 0x{:x}B extent at {}, hint={}, gen={}",
t, type, len, start, hint, gen);
left -= len;
}
diff --git a/src/crimson/os/seastore/extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager.cc
deleted file mode 100644
index b0dc1b8c8a8..00000000000
--- a/src/crimson/os/seastore/extentmap_manager.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#include <experimental/iterator>
-#include <iostream>
-
-#include "crimson/os/seastore/transaction_manager.h"
-#include "crimson/os/seastore/extentmap_manager.h"
-#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h"
-namespace crimson::os::seastore::extentmap_manager {
-
-ExtentMapManagerRef create_extentmap_manager(
- TransactionManager &trans_manager) {
- return ExtentMapManagerRef(new BtreeExtentMapManager(trans_manager));
-}
-
-}
-
-namespace crimson::os::seastore {
-
-std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs)
-{
- return out << "extent_mapping_t (" << rhs.logical_offset << "~" << rhs.length
- << "->" << rhs.laddr << ")";
-}
-
-std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs)
-{
- out << '[';
- std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", "));
- return out << ']';
-}
-
-}
diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h
index a5c9029c43c..298935bd22e 100644
--- a/src/crimson/os/seastore/journal.h
+++ b/src/crimson/os/seastore/journal.h
@@ -59,13 +59,13 @@ public:
crimson::ct_error::erange,
crimson::ct_error::input_output_error
>;
- using submit_record_ret = submit_record_ertr::future<
- record_locator_t
- >;
- virtual submit_record_ret submit_record(
+ using on_submission_func_t = std::function<
+ void(record_locator_t)>;
+ virtual submit_record_ertr::future<> submit_record(
record_t &&record,
- OrderingHandle &handle
- ) = 0;
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission) = 0;
/**
* flush
@@ -101,9 +101,6 @@ public:
virtual replay_ret replay(
delta_handler_t &&delta_handler) = 0;
- virtual seastar::future<> finish_commit(
- transaction_type_t type) = 0;
-
virtual ~Journal() {}
virtual backend_type_t get_type() = 0;
diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.cc b/src/crimson/os/seastore/journal/circular_bounded_journal.cc
index 9ee8b1b997f..41ff8318aba 100644
--- a/src/crimson/os/seastore/journal/circular_bounded_journal.cc
+++ b/src/crimson/os/seastore/journal/circular_bounded_journal.cc
@@ -58,35 +58,52 @@ CircularBoundedJournal::close_ertr::future<> CircularBoundedJournal::close()
return record_submitter.close();
}
-CircularBoundedJournal::submit_record_ret
+CircularBoundedJournal::submit_record_ertr::future<>
CircularBoundedJournal::submit_record(
record_t &&record,
- OrderingHandle &handle)
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission)
{
LOG_PREFIX(CircularBoundedJournal::submit_record);
DEBUG("H{} {} start ...", (void*)&handle, record);
assert(write_pipeline);
- return do_submit_record(std::move(record), handle);
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission)
+ ).safe_then([this, t_src] {
+ if (is_trim_transaction(t_src)) {
+ return update_journal_tail(
+ trimmer.get_dirty_tail(),
+ trimmer.get_alloc_tail());
+ } else {
+ return seastar::now();
+ }
+ });
}
-CircularBoundedJournal::submit_record_ret
+CircularBoundedJournal::submit_record_ertr::future<>
CircularBoundedJournal::do_submit_record(
record_t &&record,
- OrderingHandle &handle)
+ OrderingHandle &handle,
+ on_submission_func_t &&on_submission)
{
LOG_PREFIX(CircularBoundedJournal::do_submit_record);
if (!record_submitter.is_available()) {
DEBUG("H{} wait ...", (void*)&handle);
return record_submitter.wait_available(
- ).safe_then([this, record=std::move(record), &handle]() mutable {
- return do_submit_record(std::move(record), handle);
+ ).safe_then([this, record=std::move(record), &handle,
+ on_submission=std::move(on_submission)]() mutable {
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
});
}
auto action = record_submitter.check_action(record.size);
if (action == RecordSubmitter::action_t::ROLL) {
return record_submitter.roll_segment(
- ).safe_then([this, record=std::move(record), &handle]() mutable {
- return do_submit_record(std::move(record), handle);
+ ).safe_then([this, record=std::move(record), &handle,
+ on_submission=std::move(on_submission)]() mutable {
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
});
}
@@ -99,13 +116,16 @@ CircularBoundedJournal::do_submit_record(
return handle.enter(write_pipeline->device_submission
).then([submit_fut=std::move(submit_ret.future)]() mutable {
return std::move(submit_fut);
- }).safe_then([FNAME, this, &handle](record_locator_t result) {
+ }).safe_then([FNAME, this, &handle, on_submission=std::move(on_submission)
+ ](record_locator_t result) mutable {
return handle.enter(write_pipeline->finalize
- ).then([FNAME, this, result, &handle] {
+ ).then([FNAME, this, result, &handle,
+ on_submission=std::move(on_submission)] {
DEBUG("H{} finish with {}", (void*)&handle, result);
auto new_committed_to = result.write_result.get_end_seq();
record_submitter.update_committed_to(new_committed_to);
- return result;
+ std::invoke(on_submission, result);
+ return seastar::now();
});
});
}
@@ -392,13 +412,4 @@ Journal::replay_ret CircularBoundedJournal::replay(
});
}
-seastar::future<> CircularBoundedJournal::finish_commit(transaction_type_t type) {
- if (is_trim_transaction(type)) {
- return update_journal_tail(
- trimmer.get_dirty_tail(),
- trimmer.get_alloc_tail());
- }
- return seastar::now();
-}
-
}
diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.h b/src/crimson/os/seastore/journal/circular_bounded_journal.h
index 874bd8dc086..16278df6cfe 100644
--- a/src/crimson/os/seastore/journal/circular_bounded_journal.h
+++ b/src/crimson/os/seastore/journal/circular_bounded_journal.h
@@ -80,9 +80,11 @@ public:
return backend_type_t::RANDOM_BLOCK;
}
- submit_record_ret submit_record(
+ submit_record_ertr::future<> submit_record(
record_t &&record,
- OrderingHandle &handle
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission
) final;
seastar::future<> flush(
@@ -148,8 +150,6 @@ public:
return cjs.get_records_start();
}
- seastar::future<> finish_commit(transaction_type_t type) final;
-
using cbj_delta_handler_t = std::function<
replay_ertr::future<bool>(
const record_locator_t&,
@@ -160,7 +160,10 @@ public:
cbj_delta_handler_t &&delta_handler,
journal_seq_t tail);
- submit_record_ret do_submit_record(record_t &&record, OrderingHandle &handle);
+ submit_record_ertr::future<> do_submit_record(
+ record_t &&record,
+ OrderingHandle &handle,
+ on_submission_func_t &&on_submission);
void try_read_rolled_header(scan_valid_records_cursor &cursor) {
paddr_t addr = convert_abs_addr_to_paddr(
diff --git a/src/crimson/os/seastore/journal/record_submitter.cc b/src/crimson/os/seastore/journal/record_submitter.cc
index adf8251b8a7..4976eee96e7 100644
--- a/src/crimson/os/seastore/journal/record_submitter.cc
+++ b/src/crimson/os/seastore/journal/record_submitter.cc
@@ -24,7 +24,7 @@ RecordBatch::add_pending(
LOG_PREFIX(RecordBatch::add_pending);
auto new_size = get_encoded_length_after(record, block_size);
auto dlength_offset = pending.size.dlength;
- TRACE("{} batches={}, write_size={}, dlength_offset={} ...",
+ TRACE("{} batches={}, write_size=0x{:x}, dlength_offset=0x{:x} ...",
name,
pending.get_size() + 1,
new_size.get_encoded_length(),
@@ -144,7 +144,7 @@ RecordSubmitter::RecordSubmitter(
batches(new RecordBatch[io_depth + 1])
{
LOG_PREFIX(RecordSubmitter);
- INFO("{} io_depth_limit={}, batch_capacity={}, batch_flush_size={}, "
+ INFO("{} io_depth_limit={}, batch_capacity={}, batch_flush_size=0x{:x}, "
"preferred_fullness={}",
get_name(), io_depth, batch_capacity,
batch_flush_size, preferred_fullness);
diff --git a/src/crimson/os/seastore/journal/segment_allocator.cc b/src/crimson/os/seastore/journal/segment_allocator.cc
index 11f3cc8fd31..5405662b91e 100644
--- a/src/crimson/os/seastore/journal/segment_allocator.cc
+++ b/src/crimson/os/seastore/journal/segment_allocator.cc
@@ -189,7 +189,7 @@ SegmentAllocator::write(ceph::bufferlist&& to_write)
auto write_length = to_write.length();
auto write_start_offset = written_to;
if (unlikely(LOCAL_LOGGER.is_enabled(seastar::log_level::trace))) {
- TRACE("{} {}~{}", print_name, get_written_to(), write_length);
+ TRACE("{} {}~0x{:x}", print_name, get_written_to(), write_length);
}
assert(write_length > 0);
assert((write_length % get_block_size()) == 0);
@@ -250,7 +250,7 @@ SegmentAllocator::close_segment()
close_seg_info.num_extents};
ceph::bufferlist bl;
encode(tail, bl);
- INFO("{} close segment {}, written_to={}",
+ INFO("{} close segment {}, written_to=0x{:x}",
print_name,
tail,
written_to);
diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc
index eca45f113c2..67c0b3fb8ac 100644
--- a/src/crimson/os/seastore/journal/segmented_journal.cc
+++ b/src/crimson/os/seastore/journal/segmented_journal.cc
@@ -368,25 +368,30 @@ seastar::future<> SegmentedJournal::flush(OrderingHandle &handle)
});
}
-SegmentedJournal::submit_record_ret
+SegmentedJournal::submit_record_ertr::future<>
SegmentedJournal::do_submit_record(
record_t &&record,
- OrderingHandle &handle)
+ OrderingHandle &handle,
+ on_submission_func_t &&on_submission)
{
LOG_PREFIX(SegmentedJournal::do_submit_record);
if (!record_submitter.is_available()) {
DEBUG("H{} wait ...", (void*)&handle);
return record_submitter.wait_available(
- ).safe_then([this, record=std::move(record), &handle]() mutable {
- return do_submit_record(std::move(record), handle);
+ ).safe_then([this, record=std::move(record), &handle,
+ on_submission=std::move(on_submission)]() mutable {
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
});
}
auto action = record_submitter.check_action(record.size);
if (action == RecordSubmitter::action_t::ROLL) {
DEBUG("H{} roll, unavailable ...", (void*)&handle);
return record_submitter.roll_segment(
- ).safe_then([this, record=std::move(record), &handle]() mutable {
- return do_submit_record(std::move(record), handle);
+ ).safe_then([this, record=std::move(record), &handle,
+ on_submission=std::move(on_submission)]() mutable {
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
});
} else { // SUBMIT_FULL/NOT_FULL
DEBUG("H{} submit {} ...",
@@ -398,22 +403,27 @@ SegmentedJournal::do_submit_record(
return handle.enter(write_pipeline->device_submission
).then([submit_fut=std::move(submit_ret.future)]() mutable {
return std::move(submit_fut);
- }).safe_then([FNAME, this, &handle](record_locator_t result) {
+ }).safe_then([FNAME, this, &handle, on_submission=std::move(on_submission)
+ ](record_locator_t result) mutable {
return handle.enter(write_pipeline->finalize
- ).then([FNAME, this, result, &handle] {
+ ).then([FNAME, this, result, &handle,
+ on_submission=std::move(on_submission)] {
DEBUG("H{} finish with {}", (void*)&handle, result);
auto new_committed_to = result.write_result.get_end_seq();
record_submitter.update_committed_to(new_committed_to);
- return result;
+ std::invoke(on_submission, result);
+ return seastar::now();
});
});
}
}
-SegmentedJournal::submit_record_ret
+SegmentedJournal::submit_record_ertr::future<>
SegmentedJournal::submit_record(
record_t &&record,
- OrderingHandle &handle)
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission)
{
LOG_PREFIX(SegmentedJournal::submit_record);
DEBUG("H{} {} start ...", (void*)&handle, record);
@@ -424,12 +434,13 @@ SegmentedJournal::submit_record(
).get_encoded_length();
auto max_record_length = journal_segment_allocator.get_max_write_length();
if (expected_size > max_record_length) {
- ERROR("H{} {} exceeds max record size {}",
+ ERROR("H{} {} exceeds max record size 0x{:x}",
(void*)&handle, record, max_record_length);
return crimson::ct_error::erange::make();
}
- return do_submit_record(std::move(record), handle);
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
}
}
diff --git a/src/crimson/os/seastore/journal/segmented_journal.h b/src/crimson/os/seastore/journal/segmented_journal.h
index 891de7ec306..3f51de70fb3 100644
--- a/src/crimson/os/seastore/journal/segmented_journal.h
+++ b/src/crimson/os/seastore/journal/segmented_journal.h
@@ -44,9 +44,11 @@ public:
close_ertr::future<> close() final;
- submit_record_ret submit_record(
+ submit_record_ertr::future<> submit_record(
record_t &&record,
- OrderingHandle &handle) final;
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission) final;
seastar::future<> flush(OrderingHandle &handle) final;
@@ -59,9 +61,6 @@ public:
backend_type_t get_type() final {
return backend_type_t::SEGMENTED;
}
- seastar::future<> finish_commit(transaction_type_t type) {
- return seastar::now();
- }
bool is_checksum_needed() final {
// segmented journal always requires checksum
@@ -69,10 +68,10 @@ public:
}
private:
- submit_record_ret do_submit_record(
+ submit_record_ertr::future<> do_submit_record(
record_t &&record,
- OrderingHandle &handle
- );
+ OrderingHandle &handle,
+ on_submission_func_t &&on_submission);
SegmentSeqAllocatorRef segment_seq_allocator;
SegmentAllocator journal_segment_allocator;
diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h
index a050b2cdf47..9a34bf56157 100644
--- a/src/crimson/os/seastore/lba_manager.h
+++ b/src/crimson/os/seastore/lba_manager.h
@@ -19,6 +19,7 @@
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/lba_mapping.h"
namespace crimson::os::seastore {
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
index b7a1d8f8ba9..888d3c359ac 100644
--- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
@@ -52,28 +52,22 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node<
ceph_assert(lba_root->is_initial_pending()
== root_block->is_pending());
return {true,
- trans_intr::make_interruptible(
- c.cache.get_extent_viewable_by_trans(c.trans, lba_root))};
+ c.cache.get_extent_viewable_by_trans(c.trans, lba_root)};
} else if (root_block->is_pending()) {
auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance());
lba_root = prior.lba_root_node;
if (lba_root) {
return {true,
- trans_intr::make_interruptible(
- c.cache.get_extent_viewable_by_trans(c.trans, lba_root))};
+ c.cache.get_extent_viewable_by_trans(c.trans, lba_root)};
} else {
c.cache.account_absent_access(c.trans.get_src());
return {false,
- trans_intr::make_interruptible(
- Cache::get_extent_ertr::make_ready_future<
- CachedExtentRef>())};
+ Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()};
}
} else {
c.cache.account_absent_access(c.trans.get_src());
return {false,
- trans_intr::make_interruptible(
- Cache::get_extent_ertr::make_ready_future<
- CachedExtentRef>())};
+ Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()};
}
}
@@ -100,6 +94,45 @@ void unlink_phy_tree_root_node<laddr_t>(RootBlockRef &root_block) {
namespace crimson::os::seastore::lba_manager::btree {
+get_child_ret_t<LogicalCachedExtent>
+BtreeLBAMapping::get_logical_extent(Transaction &t)
+{
+ ceph_assert(is_parent_viewable());
+ assert(pos != std::numeric_limits<uint16_t>::max());
+ ceph_assert(t.get_trans_id() == ctx.trans.get_trans_id());
+ auto &p = static_cast<LBALeafNode&>(*parent);
+ auto k = this->is_indirect()
+ ? this->get_intermediate_base()
+ : get_key();
+ auto v = p.template get_child<LogicalCachedExtent>(ctx, pos, k);
+ if (!v.has_child()) {
+ this->child_pos = v.get_child_pos();
+ }
+ return v;
+}
+
+bool BtreeLBAMapping::is_stable() const
+{
+ assert(!this->parent_modified());
+ assert(pos != std::numeric_limits<uint16_t>::max());
+ auto &p = static_cast<LBALeafNode&>(*parent);
+ auto k = this->is_indirect()
+ ? this->get_intermediate_base()
+ : get_key();
+ return p.is_child_stable(ctx, pos, k);
+}
+
+bool BtreeLBAMapping::is_data_stable() const
+{
+ assert(!this->parent_modified());
+ assert(pos != std::numeric_limits<uint16_t>::max());
+ auto &p = static_cast<LBALeafNode&>(*parent);
+ auto k = this->is_indirect()
+ ? this->get_intermediate_base()
+ : get_key();
+ return p.is_child_data_stable(ctx, pos, k);
+}
+
BtreeLBAManager::mkfs_ret
BtreeLBAManager::mkfs(
Transaction &t)
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
index ef10ff9623b..e0902053d0e 100644
--- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
@@ -23,11 +23,15 @@
#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
#include "crimson/os/seastore/btree/btree_range_pin.h"
+namespace crimson::os::seastore {
+class LogicalCachedExtent;
+}
+
namespace crimson::os::seastore::lba_manager::btree {
struct LBALeafNode;
-class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> {
+class BtreeLBAMapping : public LBAMapping {
// To support cloning, there are two kinds of lba mappings:
// 1. physical lba mapping: the pladdr in the value of which is the paddr of
// the corresponding extent;
@@ -61,14 +65,14 @@ class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> {
// their keys.
public:
BtreeLBAMapping(op_context_t<laddr_t> ctx)
- : BtreeNodeMapping(ctx) {}
+ : LBAMapping(ctx) {}
BtreeLBAMapping(
op_context_t<laddr_t> c,
LBALeafNodeRef parent,
uint16_t pos,
lba_map_val_t &val,
lba_node_meta_t meta)
- : BtreeNodeMapping(
+ : LBAMapping(
c,
parent,
pos,
@@ -190,8 +194,12 @@ public:
SUBDEBUGT(seastore_lba, "new pin {}", ctx.trans, static_cast<LBAMapping&>(*new_pin));
return new_pin;
}
+ bool is_stable() const final;
+ bool is_data_stable() const final;
+ get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction &t);
+
protected:
- std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate(
+ LBAMappingRef _duplicate(
op_context_t<laddr_t> ctx) const final {
auto pin = std::unique_ptr<BtreeLBAMapping>(new BtreeLBAMapping(ctx));
pin->key = key;
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
index ad5d336815b..524bf23dd58 100644
--- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
@@ -259,7 +259,7 @@ struct LBALeafNode
}
// See LBAInternalNode, same concept
- void resolve_relative_addrs(paddr_t base);
+ void resolve_relative_addrs(paddr_t base) final;
void node_resolve_vals(
internal_iterator_t from,
internal_iterator_t to) const final
diff --git a/src/crimson/os/seastore/lba_mapping.cc b/src/crimson/os/seastore/lba_mapping.cc
new file mode 100644
index 00000000000..90fae09ce21
--- /dev/null
+++ b/src/crimson/os/seastore/lba_mapping.cc
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "lba_mapping.h"
+
+namespace crimson::os::seastore {
+
+std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs)
+{
+ out << "LBAMapping(" << rhs.get_key()
+ << "~0x" << std::hex << rhs.get_length() << std::dec
+ << "->" << rhs.get_val();
+ if (rhs.is_indirect()) {
+ out << ",indirect(" << rhs.get_intermediate_base()
+ << "~0x" << std::hex << rhs.get_intermediate_length()
+ << "@0x" << rhs.get_intermediate_offset() << std::dec
+ << ")";
+ }
+ out << ")";
+ return out;
+}
+
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs)
+{
+ bool first = true;
+ out << '[';
+ for (const auto &i: rhs) {
+ out << (first ? "" : ",") << *i;
+ first = false;
+ }
+ return out << ']';
+}
+
+LBAMappingRef LBAMapping::duplicate() const {
+ auto ret = _duplicate(ctx);
+ ret->range = range;
+ ret->value = value;
+ ret->parent = parent;
+ ret->len = len;
+ ret->pos = pos;
+ return ret;
+}
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/lba_mapping.h b/src/crimson/os/seastore/lba_mapping.h
new file mode 100644
index 00000000000..338d4d53f55
--- /dev/null
+++ b/src/crimson/os/seastore/lba_mapping.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+
+namespace crimson::os::seastore {
+
+class LBAMapping;
+using LBAMappingRef = std::unique_ptr<LBAMapping>;
+
+class LogicalCachedExtent;
+
+class LBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> {
+public:
+ LBAMapping(op_context_t<laddr_t> ctx)
+ : BtreeNodeMapping<laddr_t, paddr_t>(ctx) {}
+ template <typename... T>
+ LBAMapping(T&&... t)
+ : BtreeNodeMapping<laddr_t, paddr_t>(std::forward<T>(t)...)
+ {
+ if (!parent->is_pending()) {
+ this->child_pos = {parent, pos};
+ }
+ }
+
+ // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h
+ virtual bool is_indirect() const = 0;
+ virtual laddr_t get_intermediate_key() const = 0;
+ virtual laddr_t get_intermediate_base() const = 0;
+ virtual extent_len_t get_intermediate_length() const = 0;
+ // The start offset of the pin, must be 0 if the pin is not indirect
+ virtual extent_len_t get_intermediate_offset() const = 0;
+
+ virtual get_child_ret_t<LogicalCachedExtent>
+ get_logical_extent(Transaction &t) = 0;
+
+ void link_child(ChildableCachedExtent *c) {
+ ceph_assert(child_pos);
+ child_pos->link_child(c);
+ }
+ virtual LBAMappingRef refresh_with_pending_parent() = 0;
+
+ // For reserved mappings, the return values are
+ // undefined although it won't crash
+ virtual bool is_stable() const = 0;
+ virtual bool is_data_stable() const = 0;
+ virtual bool is_clone() const = 0;
+ bool is_zero_reserved() const {
+ return !get_val().is_real();
+ }
+
+ LBAMappingRef duplicate() const;
+
+ virtual ~LBAMapping() {}
+protected:
+ virtual LBAMappingRef _duplicate(op_context_t<laddr_t>) const = 0;
+ std::optional<child_pos_t> child_pos = std::nullopt;
+};
+
+std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs);
+using lba_pin_list_t = std::list<LBAMappingRef>;
+
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs);
+
+} // namespace crimson::os::seastore
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc
index 20f86da5d3d..8f817a521cf 100644
--- a/src/crimson/os/seastore/object_data_handler.cc
+++ b/src/crimson/os/seastore/object_data_handler.cc
@@ -50,6 +50,8 @@ struct extent_to_write_t {
extent_to_write_t(const extent_to_write_t &) = delete;
extent_to_write_t(extent_to_write_t &&) = default;
+ extent_to_write_t& operator=(const extent_to_write_t&) = delete;
+ extent_to_write_t& operator=(extent_to_write_t&&) = default;
bool is_data() const {
return type == type_t::DATA;
@@ -523,7 +525,7 @@ ObjectDataHandler::write_ret do_insertions(
if (region.is_data()) {
assert_aligned(region.len);
ceph_assert(region.len == region.bl->length());
- DEBUGT("allocating extent: {}~{}",
+ DEBUGT("allocating extent: {}~0x{:x}",
ctx.t,
region.addr,
region.len);
@@ -554,7 +556,7 @@ ObjectDataHandler::write_ret do_insertions(
ObjectDataHandler::write_iertr::pass_further{}
);
} else if (region.is_zero()) {
- DEBUGT("reserving: {}~{}",
+ DEBUGT("reserving: {}~0x{:x}",
ctx.t,
region.addr,
region.len);
@@ -696,7 +698,7 @@ public:
<< ", aligned_data_end=" << overwrite_plan.aligned_data_end
<< ", left_operation=" << overwrite_plan.left_operation
<< ", right_operation=" << overwrite_plan.right_operation
- << ", block_size=" << overwrite_plan.block_size
+ << ", block_size=0x" << std::hex << overwrite_plan.block_size << std::dec
<< ", is_left_fresh=" << overwrite_plan.is_left_fresh
<< ", is_right_fresh=" << overwrite_plan.is_right_fresh
<< ")";
@@ -827,7 +829,7 @@ namespace crimson::os::seastore {
*/
using operate_ret_bare = std::pair<
std::optional<extent_to_write_t>,
- std::optional<bufferptr>>;
+ std::optional<ceph::bufferlist>>;
using operate_ret = get_iertr::future<operate_ret_bare>;
operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan_t &overwrite_plan)
{
@@ -839,19 +841,26 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan
if (overwrite_plan.left_operation == overwrite_operation_t::OVERWRITE_ZERO) {
assert(pin->get_val().is_zero());
+
auto zero_extent_len = overwrite_plan.get_left_extent_size();
assert_aligned(zero_extent_len);
+ std::optional<extent_to_write_t> extent_to_write;
+ if (zero_extent_len != 0) {
+ extent_to_write = extent_to_write_t::create_zero(
+ overwrite_plan.pin_begin, zero_extent_len);
+ }
+
auto zero_prepend_len = overwrite_plan.get_left_alignment_size();
+ std::optional<ceph::bufferlist> prepend_bl;
+ if (zero_prepend_len != 0) {
+ ceph::bufferlist zero_bl;
+ zero_bl.append_zero(zero_prepend_len);
+ prepend_bl = std::move(zero_bl);
+ }
+
return get_iertr::make_ready_future<operate_ret_bare>(
- (zero_extent_len == 0
- ? std::nullopt
- : std::make_optional(extent_to_write_t::create_zero(
- overwrite_plan.pin_begin, zero_extent_len))),
- (zero_prepend_len == 0
- ? std::nullopt
- : std::make_optional(bufferptr(
- ceph::buffer::create(zero_prepend_len, 0))))
- );
+ std::move(extent_to_write),
+ std::move(prepend_bl));
} else if (overwrite_plan.left_operation == overwrite_operation_t::MERGE_EXISTING) {
auto prepend_len = overwrite_plan.get_left_size();
if (prepend_len == 0) {
@@ -859,16 +868,15 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan
std::nullopt,
std::nullopt);
} else {
- extent_len_t off = pin->get_intermediate_offset();
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t, pin->duplicate()
- ).si_then([prepend_len, off](auto left_extent) {
+ ).si_then([prepend_len](auto maybe_indirect_left_extent) {
+ auto read_bl = maybe_indirect_left_extent.get_bl();
+ ceph::bufferlist prepend_bl;
+ prepend_bl.substr_of(read_bl, 0, prepend_len);
return get_iertr::make_ready_future<operate_ret_bare>(
std::nullopt,
- std::make_optional(bufferptr(
- left_extent->get_bptr(),
- off,
- prepend_len)));
+ std::move(prepend_bl));
});
}
} else {
@@ -888,18 +896,17 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan
std::move(left_to_write_extent),
std::nullopt);
} else {
- extent_len_t off = pin->get_intermediate_offset();
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t, pin->duplicate()
- ).si_then([prepend_offset=extent_len + off, prepend_len,
+ ).si_then([prepend_offset=extent_len, prepend_len,
left_to_write_extent=std::move(left_to_write_extent)]
- (auto left_extent) mutable {
+ (auto left_maybe_indirect_extent) mutable {
+ auto read_bl = left_maybe_indirect_extent.get_bl();
+ ceph::bufferlist prepend_bl;
+ prepend_bl.substr_of(read_bl, prepend_offset, prepend_len);
return get_iertr::make_ready_future<operate_ret_bare>(
std::move(left_to_write_extent),
- std::make_optional(bufferptr(
- left_extent->get_bptr(),
- prepend_offset,
- prepend_len)));
+ std::move(prepend_bl));
});
}
}
@@ -922,19 +929,26 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla
assert(overwrite_plan.data_end >= right_pin_begin);
if (overwrite_plan.right_operation == overwrite_operation_t::OVERWRITE_ZERO) {
assert(pin->get_val().is_zero());
+
auto zero_suffix_len = overwrite_plan.get_right_alignment_size();
+ std::optional<ceph::bufferlist> suffix_bl;
+ if (zero_suffix_len != 0) {
+ ceph::bufferlist zero_bl;
+ zero_bl.append_zero(zero_suffix_len);
+ suffix_bl = std::move(zero_bl);
+ }
+
auto zero_extent_len = overwrite_plan.get_right_extent_size();
assert_aligned(zero_extent_len);
+ std::optional<extent_to_write_t> extent_to_write;
+ if (zero_extent_len != 0) {
+ extent_to_write = extent_to_write_t::create_zero(
+ overwrite_plan.aligned_data_end, zero_extent_len);
+ }
+
return get_iertr::make_ready_future<operate_ret_bare>(
- (zero_extent_len == 0
- ? std::nullopt
- : std::make_optional(extent_to_write_t::create_zero(
- overwrite_plan.aligned_data_end, zero_extent_len))),
- (zero_suffix_len == 0
- ? std::nullopt
- : std::make_optional(bufferptr(
- ceph::buffer::create(zero_suffix_len, 0))))
- );
+ std::move(extent_to_write),
+ std::move(suffix_bl));
} else if (overwrite_plan.right_operation == overwrite_operation_t::MERGE_EXISTING) {
auto append_len = overwrite_plan.get_right_size();
if (append_len == 0) {
@@ -944,17 +958,17 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla
} else {
auto append_offset =
overwrite_plan.data_end.get_byte_distance<
- extent_len_t>(right_pin_begin)
- + pin->get_intermediate_offset();
+ extent_len_t>(right_pin_begin);
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t, pin->duplicate()
- ).si_then([append_offset, append_len](auto right_extent) {
+ ).si_then([append_offset, append_len]
+ (auto right_maybe_indirect_extent) {
+ auto read_bl = right_maybe_indirect_extent.get_bl();
+ ceph::bufferlist suffix_bl;
+ suffix_bl.substr_of(read_bl, append_offset, append_len);
return get_iertr::make_ready_future<operate_ret_bare>(
std::nullopt,
- std::make_optional(bufferptr(
- right_extent->get_bptr(),
- append_offset,
- append_len)));
+ std::move(suffix_bl));
});
}
} else {
@@ -976,19 +990,18 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla
} else {
auto append_offset =
overwrite_plan.data_end.get_byte_distance<
- extent_len_t>(right_pin_begin)
- + pin->get_intermediate_offset();
+ extent_len_t>(right_pin_begin);
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t, pin->duplicate()
).si_then([append_offset, append_len,
right_to_write_extent=std::move(right_to_write_extent)]
- (auto right_extent) mutable {
+ (auto maybe_indirect_right_extent) mutable {
+ auto read_bl = maybe_indirect_right_extent.get_bl();
+ ceph::bufferlist suffix_bl;
+ suffix_bl.substr_of(read_bl, append_offset, append_len);
return get_iertr::make_ready_future<operate_ret_bare>(
std::move(right_to_write_extent),
- std::make_optional(bufferptr(
- right_extent->get_bptr(),
- append_offset,
- append_len)));
+ std::move(suffix_bl));
});
}
}
@@ -1046,13 +1059,13 @@ ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
ceph_assert(size <= max_object_size);
if (!object_data.is_null()) {
ceph_assert(object_data.get_reserved_data_len() == max_object_size);
- DEBUGT("reservation present: {}~{}",
+ DEBUGT("reservation present: {}~0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len());
return write_iertr::now();
} else {
- DEBUGT("reserving: {}~{}",
+ DEBUGT("reserving: {}~0x{:x}",
ctx.t,
ctx.onode.get_data_hint(),
max_object_size);
@@ -1085,7 +1098,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
auto data_base = object_data.get_reserved_data_base();
auto data_len = object_data.get_reserved_data_len();
- DEBUGT("object_data: {}~{}", ctx.t, data_base, data_len);
+ DEBUGT("object_data: {}~0x{:x}", ctx.t, data_base, data_len);
laddr_t aligned_start = (data_base + size).get_aligned_laddr();
loffset_t aligned_length =
data_len - aligned_start.get_byte_distance<loffset_t>(data_base);
@@ -1121,7 +1134,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
if (append_len == 0) {
LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
TRACET("First pin overlaps the boundary and has aligned data"
- "create existing at addr:{}, len:{}",
+ "create existing at addr:{}, len:0x{:x}",
ctx.t, pin.get_key(), size - pin_offset);
to_write.push_back(extent_to_write_t::create_existing(
pin.duplicate(),
@@ -1136,22 +1149,18 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
ctx.t,
pin.duplicate()
).si_then([ctx, size, pin_offset, append_len, roundup_size,
- &pin, &object_data, &to_write](auto extent) {
- bufferlist bl;
- bl.append(
- bufferptr(
- extent->get_bptr(),
- pin.get_intermediate_offset(),
- size - pin_offset
- ));
- bl.append_zero(append_len);
+ &pin, &object_data, &to_write](auto maybe_indirect_extent) {
+ auto read_bl = maybe_indirect_extent.get_bl();
+ ceph::bufferlist write_bl;
+ write_bl.substr_of(read_bl, 0, size - pin_offset);
+ write_bl.append_zero(append_len);
LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
TRACET("First pin overlaps the boundary and has unaligned data"
- "create data at addr:{}, len:{}",
- ctx.t, pin.get_key(), bl.length());
+ "create data at addr:{}, len:0x{:x}",
+ ctx.t, pin.get_key(), write_bl.length());
to_write.push_back(extent_to_write_t::create_data(
pin.get_key(),
- bl));
+ write_bl));
to_write.push_back(extent_to_write_t::create_zero(
(object_data.get_reserved_data_base() + roundup_size).checked_to_laddr(),
object_data.get_reserved_data_len() - roundup_size));
@@ -1184,44 +1193,45 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
* get_to_writes_with_zero_buffer
*
* Returns extent_to_write_t's reflecting a zero region extending
- * from offset~len with headptr optionally on the left and tailptr
+ * from offset~len with headbl optionally on the left and tailbl
* optionally on the right.
*/
extent_to_write_list_t get_to_writes_with_zero_buffer(
laddr_t data_base,
const extent_len_t block_size,
objaddr_t offset, extent_len_t len,
- std::optional<bufferptr> &&headptr, std::optional<bufferptr> &&tailptr)
+ std::optional<ceph::bufferlist> &&headbl,
+ std::optional<ceph::bufferlist> &&tailbl)
{
auto zero_left = p2roundup(offset, (objaddr_t)block_size);
auto zero_right = p2align(offset + len, (objaddr_t)block_size);
- auto left = headptr ? (offset - headptr->length()) : offset;
- auto right = tailptr ?
- (offset + len + tailptr->length()) :
+ auto left = headbl ? (offset - headbl->length()) : offset;
+ auto right = tailbl ?
+ (offset + len + tailbl->length()) :
(offset + len);
assert(
- (headptr && ((zero_left - left) ==
- p2roundup(headptr->length(), block_size))) ^
- (!headptr && (zero_left == left)));
+ (headbl && ((zero_left - left) ==
+ p2roundup(headbl->length(), block_size))) ^
+ (!headbl && (zero_left == left)));
assert(
- (tailptr && ((right - zero_right) ==
- p2roundup(tailptr->length(), block_size))) ^
- (!tailptr && (right == zero_right)));
+ (tailbl && ((right - zero_right) ==
+ p2roundup(tailbl->length(), block_size))) ^
+ (!tailbl && (right == zero_right)));
assert(right > left);
// zero region too small for a reserved section,
- // headptr and tailptr in same extent
+ // headbl and tailbl in same extent
if (zero_right <= zero_left) {
bufferlist bl;
- if (headptr) {
- bl.append(*headptr);
+ if (headbl) {
+ bl.append(*headbl);
}
bl.append_zero(
- right - left - bl.length() - (tailptr ? tailptr->length() : 0));
- if (tailptr) {
- bl.append(*tailptr);
+ right - left - bl.length() - (tailbl ? tailbl->length() : 0));
+ if (tailbl) {
+ bl.append(*tailbl);
}
assert(bl.length() % block_size == 0);
assert(bl.length() == (right - left));
@@ -1230,16 +1240,16 @@ extent_to_write_list_t get_to_writes_with_zero_buffer(
(data_base + left).checked_to_laddr(), bl));
return ret;
} else {
- // reserved section between ends, headptr and tailptr in different extents
+ // reserved section between ends, headbl and tailbl in different extents
extent_to_write_list_t ret;
- if (headptr) {
- bufferlist headbl;
- headbl.append(*headptr);
- headbl.append_zero(zero_left - left - headbl.length());
- assert(headbl.length() % block_size == 0);
- assert(headbl.length() > 0);
+ if (headbl) {
+ bufferlist head_zero_bl;
+ head_zero_bl.append(*headbl);
+ head_zero_bl.append_zero(zero_left - left - head_zero_bl.length());
+ assert(head_zero_bl.length() % block_size == 0);
+ assert(head_zero_bl.length() > 0);
ret.push_back(extent_to_write_t::create_data(
- (data_base + left).checked_to_laddr(), headbl));
+ (data_base + left).checked_to_laddr(), head_zero_bl));
}
// reserved zero region
ret.push_back(extent_to_write_t::create_zero(
@@ -1247,14 +1257,14 @@ extent_to_write_list_t get_to_writes_with_zero_buffer(
zero_right - zero_left));
assert(ret.back().len % block_size == 0);
assert(ret.back().len > 0);
- if (tailptr) {
- bufferlist tailbl;
- tailbl.append(*tailptr);
- tailbl.append_zero(right - zero_right - tailbl.length());
- assert(tailbl.length() % block_size == 0);
- assert(tailbl.length() > 0);
+ if (tailbl) {
+ bufferlist tail_zero_bl;
+ tail_zero_bl.append(*tailbl);
+ tail_zero_bl.append_zero(right - zero_right - tail_zero_bl.length());
+ assert(tail_zero_bl.length() % block_size == 0);
+ assert(tail_zero_bl.length() > 0);
ret.push_back(extent_to_write_t::create_data(
- (data_base + zero_right).checked_to_laddr(), tailbl));
+ (data_base + zero_right).checked_to_laddr(), tail_zero_bl));
}
return ret;
}
@@ -1293,7 +1303,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
(auto &pins, auto &to_write) mutable
{
LOG_PREFIX(ObjectDataHandler::overwrite);
- DEBUGT("overwrite: {}~{}",
+ DEBUGT("overwrite: 0x{:x}~0x{:x}",
ctx.t,
offset,
len);
@@ -1306,13 +1316,13 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
overwrite_plan
).si_then([ctx, data_base, len, offset, overwrite_plan, bl=std::move(bl),
&to_write, &pins, this](auto p) mutable {
- auto &[left_extent, headptr] = p;
+ auto &[left_extent, headbl] = p;
if (left_extent) {
ceph_assert(left_extent->addr == overwrite_plan.pin_begin);
append_extent_to_write(to_write, std::move(*left_extent));
}
- if (headptr) {
- assert(headptr->length() > 0);
+ if (headbl) {
+ assert(headbl->length() > 0);
}
return operate_right(
ctx,
@@ -1321,19 +1331,19 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
).si_then([ctx, data_base, len, offset,
pin_begin=overwrite_plan.pin_begin,
pin_end=overwrite_plan.pin_end,
- bl=std::move(bl), headptr=std::move(headptr),
+ bl=std::move(bl), headbl=std::move(headbl),
&to_write, &pins, this](auto p) mutable {
- auto &[right_extent, tailptr] = p;
+ auto &[right_extent, tailbl] = p;
if (bl.has_value()) {
auto write_offset = offset;
bufferlist write_bl;
- if (headptr) {
- write_bl.append(*headptr);
- write_offset = write_offset - headptr->length();
+ if (headbl) {
+ write_bl.append(*headbl);
+ write_offset = write_offset - headbl->length();
}
write_bl.claim_append(*bl);
- if (tailptr) {
- write_bl.append(*tailptr);
+ if (tailbl) {
+ write_bl.append(*tailbl);
assert_aligned(write_bl.length());
}
splice_extent_to_write(
@@ -1347,8 +1357,8 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
ctx.tm.get_block_size(),
offset,
len,
- std::move(headptr),
- std::move(tailptr)));
+ std::move(headbl),
+ std::move(tailbl)));
}
if (right_extent) {
ceph_assert(right_extent->get_end_addr() == pin_end);
@@ -1383,7 +1393,7 @@ ObjectDataHandler::zero_ret ObjectDataHandler::zero(
ctx,
[this, ctx, offset, len](auto &object_data) {
LOG_PREFIX(ObjectDataHandler::zero);
- DEBUGT("zero to {}~{}, object_data: {}~{}, is_null {}",
+ DEBUGT("zero to 0x{:x}~0x{:x}, object_data: {}~0x{:x}, is_null {}",
ctx.t,
offset,
len,
@@ -1424,7 +1434,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::write(
ctx,
[this, ctx, offset, &bl](auto &object_data) {
LOG_PREFIX(ObjectDataHandler::write);
- DEBUGT("writing to {}~{}, object_data: {}~{}, is_null {}",
+ DEBUGT("writing to 0x{:x}~0x{:x}, object_data: {}~0x{:x}, is_null {}",
ctx.t,
offset,
bl.length(),
@@ -1469,7 +1479,7 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
ctx,
[ctx, obj_offset, len, &ret](const auto &object_data) {
LOG_PREFIX(ObjectDataHandler::read);
- DEBUGT("reading {}~{}",
+ DEBUGT("reading {}~0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len());
@@ -1501,83 +1511,74 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
pins,
[FNAME, ctx, l_start, l_end,
&l_current, &ret](auto &pin) -> read_iertr::future<> {
- auto pin_key = pin->get_key();
- if (l_current == l_start) {
- ceph_assert(l_current >= pin_key);
- } else {
+ auto pin_start = pin->get_key();
+ extent_len_t read_start;
+ extent_len_t read_start_aligned;
+ if (l_current == l_start) { // first pin may skip head
+ ceph_assert(l_current.get_aligned_laddr() >= pin_start);
+ read_start = l_current.template
+ get_byte_distance<extent_len_t>(pin_start);
+ read_start_aligned = p2align(read_start, ctx.tm.get_block_size());
+ } else { // non-first pin must match start
assert(l_current > l_start);
- ceph_assert(l_current == pin_key);
+ ceph_assert(l_current == pin_start);
+ read_start = 0;
+ read_start_aligned = 0;
}
+
ceph_assert(l_current < l_end);
auto pin_len = pin->get_length();
assert(pin_len > 0);
- laddr_offset_t l_pin_end = pin_key + pin_len;
- ceph_assert(l_current < l_pin_end);
- laddr_offset_t l_current_end = std::min(l_pin_end, l_end);
+ laddr_offset_t pin_end = pin_start + pin_len;
+ assert(l_current < pin_end);
+ laddr_offset_t l_current_end = std::min(pin_end, l_end);
+ extent_len_t read_len =
+ l_current_end.get_byte_distance<extent_len_t>(l_current);
+
if (pin->get_val().is_zero()) {
- DEBUGT("got {}~{} from zero-pin {}~{}",
+ DEBUGT("got {}~0x{:x} from zero-pin {}~0x{:x}",
ctx.t,
l_current,
- l_current_end.get_byte_distance<loffset_t>(l_current),
- pin_key,
+ read_len,
+ pin_start,
pin_len);
- ret.append_zero(
- l_current_end.get_byte_distance<
- extent_len_t>(l_current));
+ ret.append_zero(read_len);
l_current = l_current_end;
return seastar::now();
}
// non-zero pin
- bool is_indirect = pin->is_indirect();
- laddr_t e_key;
- extent_len_t e_len;
- extent_len_t e_off;
- if (is_indirect) {
- e_key = pin->get_intermediate_base();
- e_len = pin->get_intermediate_length();
- e_off = pin->get_intermediate_offset();
- DEBUGT("reading {}~{} from indirect-pin {}~{}, direct-pin {}~{}(off={})",
- ctx.t,
- l_current,
- l_current_end.get_byte_distance<extent_len_t>(l_current),
- pin_key,
- pin_len,
- e_key,
- e_len,
- e_off);
- assert(e_key <= pin->get_intermediate_key());
- assert(e_off + pin_len <= e_len);
- } else {
- DEBUGT("reading {}~{} from pin {}~{}",
- ctx.t,
- l_current,
- l_current_end.get_byte_distance<
- extent_len_t>(l_current),
- pin_key,
- pin_len);
- e_key = pin_key;
- e_len = pin_len;
- e_off = 0;
- }
- extent_len_t e_current_off = (l_current + e_off)
- .template get_byte_distance<extent_len_t>(pin_key);
+ laddr_t l_current_end_aligned = l_current_end.get_roundup_laddr();
+ extent_len_t read_len_aligned =
+ l_current_end_aligned.get_byte_distance<extent_len_t>(pin_start);
+ read_len_aligned -= read_start_aligned;
+ extent_len_t unalign_start_offset = read_start - read_start_aligned;
+ DEBUGT("reading {}~0x{:x} from pin {}~0x{:x}",
+ ctx.t,
+ l_current,
+ read_len,
+ pin_start,
+ pin_len);
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t,
- std::move(pin)
+ std::move(pin),
+ read_start_aligned,
+ read_len_aligned
).si_then([&ret, &l_current, l_current_end,
-#ifndef NDEBUG
- e_key, e_len, e_current_off](auto extent) {
-#else
- e_current_off](auto extent) {
-#endif
- assert(e_key == extent->get_laddr());
- assert(e_len == extent->get_length());
- ret.append(
- bufferptr(
- extent->get_bptr(),
- e_current_off,
- l_current_end.get_byte_distance<extent_len_t>(l_current)));
+ read_start_aligned, read_len_aligned,
+ unalign_start_offset, read_len](auto maybe_indirect_extent) {
+ auto aligned_bl = maybe_indirect_extent.get_range(
+ read_start_aligned, read_len_aligned);
+ if (read_len < read_len_aligned) {
+ ceph::bufferlist unaligned_bl;
+ unaligned_bl.substr_of(
+ aligned_bl, unalign_start_offset, read_len);
+ ret.append(std::move(unaligned_bl));
+ } else {
+ assert(read_len == read_len_aligned);
+ assert(unalign_start_offset == 0);
+ ret.append(std::move(aligned_bl));
+ }
l_current = l_current_end;
return seastar::now();
}).handle_error_interruptible(
@@ -1608,7 +1609,7 @@ ObjectDataHandler::fiemap_ret ObjectDataHandler::fiemap(
[ctx, obj_offset, len, &ret](const auto &object_data) {
LOG_PREFIX(ObjectDataHandler::fiemap);
DEBUGT(
- "{}~{}, reservation {}~{}",
+ "0x{:x}~0x{:x}, reservation {}~0x{:x}",
ctx.t,
obj_offset,
len,
@@ -1663,7 +1664,7 @@ ObjectDataHandler::truncate_ret ObjectDataHandler::truncate(
ctx,
[this, ctx, offset](auto &object_data) {
LOG_PREFIX(ObjectDataHandler::truncate);
- DEBUGT("truncating {}~{} offset: {}",
+ DEBUGT("truncating {}~0x{:x} offset: 0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len(),
@@ -1706,7 +1707,7 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone_extents(
laddr_t data_base)
{
LOG_PREFIX(ObjectDataHandler::clone_extents);
- TRACET(" object_data: {}~{}, data_base: {}",
+ TRACET("object_data: {}~0x{:x}, data_base: 0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len(),
@@ -1791,7 +1792,7 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone(
auto len = object_data.get_reserved_data_len();
object_data.clear();
LOG_PREFIX(ObjectDataHandler::clone);
- DEBUGT("cloned obj reserve_data_base: {}, len {}",
+ DEBUGT("cloned obj reserve_data_base: {}, len 0x{:x}",
ctx.t,
d_object_data.get_reserved_data_base(),
d_object_data.get_reserved_data_len());
@@ -1801,7 +1802,7 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone(
d_object_data.get_reserved_data_len()
).si_then([&d_object_data, ctx, &object_data, base, len, this] {
LOG_PREFIX("ObjectDataHandler::clone");
- DEBUGT("head obj reserve_data_base: {}, len {}",
+ DEBUGT("head obj reserve_data_base: {}, len 0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len());
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
index 795daeddb11..7c2392731c0 100644
--- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
@@ -48,7 +48,8 @@ struct OMapNode : LogicalCachedExtent {
need_merge(n_merge) {}
};
- OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
+ explicit OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
+ explicit OMapNode(extent_len_t length) : LogicalCachedExtent(length) {}
OMapNode(const OMapNode &other)
: LogicalCachedExtent(other) {}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
index 8d06accef1e..df97f394a0d 100644
--- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
@@ -734,23 +734,28 @@ omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth)
{
ceph_assert(depth > 0);
if (depth > 1) {
- return oc.tm.read_extent<OMapInnerNode>(oc.t, laddr,
- OMAP_INNER_BLOCK_SIZE)
- .handle_error_interruptible(
+ return oc.tm.read_extent<OMapInnerNode>(
+ oc.t, laddr, OMAP_INNER_BLOCK_SIZE
+ ).handle_error_interruptible(
omap_load_extent_iertr::pass_further{},
crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
- ).si_then(
- [](auto&& e) {
- return seastar::make_ready_future<OMapNodeRef>(std::move(e));
+ ).si_then([](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ return seastar::make_ready_future<OMapNodeRef>(
+ std::move(maybe_indirect_extent.extent));
});
} else {
- return oc.tm.read_extent<OMapLeafNode>(oc.t, laddr, OMAP_LEAF_BLOCK_SIZE
+ return oc.tm.read_extent<OMapLeafNode>(
+ oc.t, laddr, OMAP_LEAF_BLOCK_SIZE
).handle_error_interruptible(
omap_load_extent_iertr::pass_further{},
crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
- ).si_then(
- [](auto&& e) {
- return seastar::make_ready_future<OMapNodeRef>(std::move(e));
+ ).si_then([](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ return seastar::make_ready_future<OMapNodeRef>(
+ std::move(maybe_indirect_extent.extent));
});
}
}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
index a2b51bbb0e1..2267942f035 100644
--- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
@@ -31,10 +31,18 @@ struct OMapInnerNode
StringKVInnerNodeLayout {
using OMapInnerNodeRef = TCachedExtentRef<OMapInnerNode>;
using internal_iterator_t = const_iterator;
- template <typename... T>
- OMapInnerNode(T&&... t) :
- OMapNode(std::forward<T>(t)...),
- StringKVInnerNodeLayout(get_bptr().c_str()) {}
+
+ explicit OMapInnerNode(ceph::bufferptr &&ptr)
+ : OMapNode(std::move(ptr)) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+ // Must be identical with OMapInnerNode(ptr) after on_fully_loaded()
+ explicit OMapInnerNode(extent_len_t length)
+ : OMapNode(length) {}
+ OMapInnerNode(const OMapInnerNode &rhs)
+ : OMapNode(rhs) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
omap_node_meta_t get_node_meta() const final { return get_meta(); }
bool extent_will_overflow(size_t ksize, std::optional<size_t> vsize) const {
@@ -46,6 +54,10 @@ struct OMapInnerNode
bool extent_is_below_min() const { return below_min(); }
uint32_t get_node_size() { return get_size(); }
+ void on_fully_loaded() final {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+
CachedExtentRef duplicate_for_write(Transaction&) final {
assert(delta_buffer.empty());
return CachedExtentRef(new OMapInnerNode(*this));
@@ -148,10 +160,18 @@ struct OMapLeafNode
using OMapLeafNodeRef = TCachedExtentRef<OMapLeafNode>;
using internal_iterator_t = const_iterator;
- template <typename... T>
- OMapLeafNode(T&&... t) :
- OMapNode(std::forward<T>(t)...),
- StringKVLeafNodeLayout(get_bptr().c_str()) {}
+
+ explicit OMapLeafNode(ceph::bufferptr &&ptr)
+ : OMapNode(std::move(ptr)) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+ // Must be identical with OMapLeafNode(ptr) after on_fully_loaded()
+ explicit OMapLeafNode(extent_len_t length)
+ : OMapNode(length) {}
+ OMapLeafNode(const OMapLeafNode &rhs)
+ : OMapNode(rhs) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
omap_node_meta_t get_node_meta() const final { return get_meta(); }
bool extent_will_overflow(
@@ -164,6 +184,10 @@ struct OMapLeafNode
bool extent_is_below_min() const { return below_min(); }
uint32_t get_node_size() { return get_size(); }
+ void on_fully_loaded() final {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+
CachedExtentRef duplicate_for_write(Transaction&) final {
assert(delta_buffer.empty());
return CachedExtentRef(new OMapLeafNode(*this));
diff --git a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
index 72b13fedfb1..3825ebef145 100644
--- a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
+++ b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
@@ -504,8 +504,13 @@ public:
inner_remove(iter);
}
- StringKVInnerNodeLayout(char *buf) :
- buf(buf) {}
+ StringKVInnerNodeLayout() : buf(nullptr) {}
+
+ void set_layout_buf(char *_buf) {
+ assert(buf == nullptr);
+ assert(_buf != nullptr);
+ buf = _buf;
+ }
uint32_t get_size() const {
ceph_le32 &size = *layout.template Pointer<0>(buf);
@@ -1120,8 +1125,13 @@ public:
leaf_remove(iter);
}
- StringKVLeafNodeLayout(char *buf) :
- buf(buf) {}
+ StringKVLeafNodeLayout() : buf(nullptr) {}
+
+ void set_layout_buf(char *_buf) {
+ assert(buf == nullptr);
+ assert(_buf != nullptr);
+ buf = _buf;
+ }
const_iterator iter_begin() const {
return const_iterator(
diff --git a/src/crimson/os/seastore/onode.cc b/src/crimson/os/seastore/onode.cc
index f3fd6eb18a5..dc8f6e87c8e 100644
--- a/src/crimson/os/seastore/onode.cc
+++ b/src/crimson/os/seastore/onode.cc
@@ -11,7 +11,7 @@ std::ostream& operator<<(std::ostream &out, const Onode &rhs)
auto &layout = rhs.get_layout();
return out << "Onode("
<< "hobj=" << rhs.hobj << ", "
- << "size=" << static_cast<uint32_t>(layout.size)
+ << "size=0x" << std::hex << static_cast<uint32_t>(layout.size) << std::dec
<< ")";
}
diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h
index 072c57864be..fa2ed65c0f3 100644
--- a/src/crimson/os/seastore/onode.h
+++ b/src/crimson/os/seastore/onode.h
@@ -36,8 +36,8 @@ struct onode_layout_t {
object_data_le_t object_data;
- char oi[MAX_OI_LENGTH];
- char ss[MAX_SS_LENGTH];
+ char oi[MAX_OI_LENGTH] = {0};
+ char ss[MAX_SS_LENGTH] = {0};
} __attribute__((packed));
class Transaction;
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
index 9230051cc50..04b959f767d 100644
--- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
@@ -41,8 +41,10 @@ class SeastoreSuper final: public Super {
class SeastoreNodeExtent final: public NodeExtent {
public:
- SeastoreNodeExtent(ceph::bufferptr &&ptr)
+ explicit SeastoreNodeExtent(ceph::bufferptr &&ptr)
: NodeExtent(std::move(ptr)) {}
+ explicit SeastoreNodeExtent(extent_len_t length)
+ : NodeExtent(length) {}
SeastoreNodeExtent(const SeastoreNodeExtent& other)
: NodeExtent(other) {}
~SeastoreNodeExtent() override = default;
@@ -111,10 +113,14 @@ class SeastoreNodeExtentManager final: public TransactionManagerHandle {
}
}
return tm.read_extent<SeastoreNodeExtent>(t, addr
- ).si_then([addr, &t](auto&& e) -> read_iertr::future<NodeExtentRef> {
+ ).si_then([addr, &t](auto maybe_indirect_extent)
+ -> read_iertr::future<NodeExtentRef> {
+ auto e = maybe_indirect_extent.extent;
SUBTRACET(seastore_onode,
"read {}B at {} -- {}",
t, e->get_length(), e->get_laddr(), *e);
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
assert(e->get_laddr() == addr);
std::ignore = addr;
return read_iertr::make_ready_future<NodeExtentRef>(e);
diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
index 9f6a566d15c..97b7902edf5 100644
--- a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
+++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
@@ -188,10 +188,10 @@ BlockRBManager::write_ertr::future<> BlockRBManager::write(
void BlockRBManager::prefill_fragmented_device()
{
LOG_PREFIX(BlockRBManager::prefill_fragmented_device);
- // the first 2 blocks must be allocated to lba root
+ // the first 3 blocks must be allocated to lba root
// and backref root during mkfs
- for (size_t block = get_block_size() * 2;
- block <= get_size() - get_block_size() * 2;
+ for (size_t block = get_block_size() * 3;
+ block <= get_size() - get_block_size() * 3;
block += get_block_size() * 2) {
DEBUG("marking {}~{} used",
get_start_rbm_addr() + block,
diff --git a/src/crimson/os/seastore/record_scanner.cc b/src/crimson/os/seastore/record_scanner.cc
index 5fab11505ce..172ba77577e 100644
--- a/src/crimson/os/seastore/record_scanner.cc
+++ b/src/crimson/os/seastore/record_scanner.cc
@@ -18,7 +18,7 @@ RecordScanner::scan_valid_records(
{
LOG_PREFIX(RecordScanner::scan_valid_records);
initialize_cursor(cursor);
- DEBUG("starting at {}, budget={}", cursor, budget);
+ DEBUG("starting at {}, budget=0x{:x}", cursor, budget);
auto retref = std::make_unique<size_t>(0);
auto &budget_used = *retref;
return crimson::repeat(
@@ -91,7 +91,7 @@ RecordScanner::scan_valid_records(
}
}().safe_then([=, &budget_used, &cursor] {
if (cursor.is_complete() || budget_used >= budget) {
- DEBUG("finish at {}, budget_used={}, budget={}",
+ DEBUG("finish at {}, budget_used=0x{:x}, budget=0x{:x}",
cursor, budget_used, budget);
return seastar::stop_iteration::yes;
} else {
@@ -112,13 +112,13 @@ RecordScanner::read_validate_record_metadata(
paddr_t start = cursor.seq.offset;
auto block_size = cursor.get_block_size();
if (get_segment_off(cursor.seq.offset) + block_size > get_segment_end_offset(cursor.seq.offset)) {
- DEBUG("failed -- record group header block {}~4096 > segment_size {}",
- start, get_segment_end_offset(cursor.seq.offset));
+ DEBUG("failed -- record group header block {}~0x{:x} > segment_size 0x{:x}",
+ start, block_size, get_segment_end_offset(cursor.seq.offset));
return read_validate_record_metadata_ret(
read_validate_record_metadata_ertr::ready_future_marker{},
std::nullopt);
}
- TRACE("reading record group header block {}~4096", start);
+ TRACE("reading record group header block {}~0x{:x}", start, block_size);
return read(start, block_size
).safe_then([this, FNAME, nonce, block_size, &cursor](bufferptr bptr)
-> read_validate_record_metadata_ret {
@@ -159,7 +159,7 @@ RecordScanner::read_validate_record_metadata(
paddr_t rest_start = cursor.seq.offset.add_offset(block_size);
auto rest_len = header.mdlength - block_size;
- TRACE("reading record group header rest {}~{}", rest_start, rest_len);
+ TRACE("reading record group header rest {}~0x{:x}", rest_start, rest_len);
return read(rest_start, rest_len
).safe_then([header=std::move(header), bl=std::move(bl)
](auto&& bptail) mutable {
@@ -189,7 +189,7 @@ RecordScanner::read_validate_data_ret RecordScanner::read_validate_data(
{
LOG_PREFIX(RecordScanner::read_validate_data);
auto data_addr = record_base.add_offset(header.mdlength);
- TRACE("reading record group data blocks {}~{}", data_addr, header.dlength);
+ TRACE("reading record group data blocks {}~0x{:x}", data_addr, header.dlength);
return read(
data_addr,
header.dlength
@@ -220,7 +220,7 @@ RecordScanner::consume_next_records(
total_length
}
};
- DEBUG("processing {} at {}, budget_used={}",
+ DEBUG("processing {} at {}, budget_used=0x{:x}",
next.header, locator, budget_used);
return handler(
locator,
diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h
index 942434dd596..26b8604500d 100644
--- a/src/crimson/os/seastore/root_block.h
+++ b/src/crimson/os/seastore/root_block.h
@@ -41,7 +41,7 @@ struct RootBlock : CachedExtent {
CachedExtent* lba_root_node = nullptr;
CachedExtent* backref_root_node = nullptr;
- RootBlock() : CachedExtent(zero_length_t()) {};
+ RootBlock() : CachedExtent(root_construct_t()) {};
RootBlock(const RootBlock &rhs)
: CachedExtent(rhs),
diff --git a/src/crimson/os/seastore/root_meta.h b/src/crimson/os/seastore/root_meta.h
new file mode 100644
index 00000000000..edf082f1e38
--- /dev/null
+++ b/src/crimson/os/seastore/root_meta.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/cached_extent.h"
+
+namespace crimson::os::seastore {
+
+struct RootMetaBlock : LogicalCachedExtent {
+ using meta_t = std::map<std::string, std::string>;
+ using Ref = TCachedExtentRef<RootMetaBlock>;
+ static constexpr size_t SIZE = 4096;
+ static constexpr int MAX_META_LENGTH = 1024;
+
+ explicit RootMetaBlock(ceph::bufferptr &&ptr)
+ : LogicalCachedExtent(std::move(ptr)) {}
+ explicit RootMetaBlock(extent_len_t length)
+ : LogicalCachedExtent(length) {}
+ RootMetaBlock(const RootMetaBlock &rhs)
+ : LogicalCachedExtent(rhs) {}
+
+ CachedExtentRef duplicate_for_write(Transaction&) final {
+ return CachedExtentRef(new RootMetaBlock(*this));
+ }
+
+ static constexpr extent_types_t TYPE = extent_types_t::ROOT_META;
+ extent_types_t get_type() const final {
+ return extent_types_t::ROOT_META;
+ }
+
+ /// dumps root meta as delta
+ ceph::bufferlist get_delta() final {
+ ceph::bufferlist bl;
+ ceph::buffer::ptr bptr(get_bptr(), 0, MAX_META_LENGTH);
+ bl.append(bptr);
+ return bl;
+ }
+
+ /// overwrites root
+ void apply_delta(const ceph::bufferlist &_bl) final
+ {
+ assert(_bl.length() == MAX_META_LENGTH);
+ ceph::bufferlist bl = _bl;
+ bl.rebuild();
+ get_bptr().copy_in(0, MAX_META_LENGTH, bl.front().c_str());
+ }
+
+ meta_t get_meta() const {
+ bufferlist bl;
+ bl.append(get_bptr());
+ meta_t ret;
+ auto iter = bl.cbegin();
+ decode(ret, iter);
+ return ret;
+ }
+
+ void set_meta(const meta_t &m) {
+ ceph::bufferlist bl;
+ encode(m, bl);
+ ceph_assert(bl.length() <= MAX_META_LENGTH);
+ bl.rebuild();
+ get_bptr().zero(0, MAX_META_LENGTH);
+ get_bptr().copy_in(0, bl.length(), bl.front().c_str());
+ }
+
+};
+using RootMetaBlockRef = RootMetaBlock::Ref;
+
+} // crimson::os::seastore
+
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::RootMetaBlock>
+ : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
index d90edbb20db..6a866cb1f9b 100644
--- a/src/crimson/os/seastore/seastore.cc
+++ b/src/crimson/os/seastore/seastore.cc
@@ -408,6 +408,7 @@ SeaStore::Shard::mkfs_managers()
return transaction_manager->with_transaction_intr(
Transaction::src_t::MUTATE,
"mkfs_seastore",
+ CACHE_HINT_TOUCH,
[this](auto& t)
{
LOG_PREFIX(SeaStoreS::mkfs_managers);
@@ -897,9 +898,10 @@ get_ranges(CollectionRef ch,
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
SeaStore::Shard::list_objects(CollectionRef ch,
- const ghobject_t& start,
- const ghobject_t& end,
- uint64_t limit) const
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit,
+ uint32_t op_flags) const
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -910,13 +912,14 @@ SeaStore::Shard::list_objects(CollectionRef ch,
return seastar::do_with(
RetType(std::vector<ghobject_t>(), start),
std::move(limit),
- [this, ch, start, end](auto& ret, auto& limit) {
- return repeat_eagain([this, ch, start, end, &limit, &ret] {
+ [this, ch, start, end, op_flags](auto& ret, auto& limit) {
+ return repeat_eagain([this, ch, start, end, &limit, &ret, op_flags] {
++(shard_stats.repeat_read_num);
return transaction_manager->with_transaction_intr(
Transaction::src_t::READ,
"list_objects",
+ op_flags,
[this, ch, start, end, &limit, &ret](auto &t)
{
LOG_PREFIX(SeaStoreS::list_objects);
@@ -1054,6 +1057,7 @@ SeaStore::Shard::list_collections()
return transaction_manager->with_transaction_intr(
Transaction::src_t::READ,
"list_collections",
+ CACHE_HINT_TOUCH,
[this, &ret](auto& t)
{
LOG_PREFIX(SeaStoreS::list_collections);
@@ -1137,6 +1141,7 @@ SeaStore::Shard::read(
Transaction::src_t::READ,
"read",
op_type_t::READ,
+ op_flags,
[this, offset, len, op_flags](auto &t, auto &onode) {
return _read(t, onode, offset, len, op_flags);
}).finally([this] {
@@ -1148,7 +1153,8 @@ SeaStore::Shard::read(
SeaStore::Shard::base_errorator::future<bool>
SeaStore::Shard::exists(
CollectionRef c,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
LOG_PREFIX(SeaStoreS::exists);
++(shard_stats.read_num);
@@ -1160,6 +1166,7 @@ SeaStore::Shard::exists(
Transaction::src_t::READ,
"exists",
op_type_t::READ,
+ op_flags,
[FNAME](auto& t, auto&) {
DEBUGT("exists", t);
return seastar::make_ready_future<bool>(true);
@@ -1240,7 +1247,8 @@ SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
SeaStore::Shard::get_attr(
CollectionRef ch,
const ghobject_t& oid,
- std::string_view name) const
+ std::string_view name,
+ uint32_t op_flags) const
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1251,6 +1259,7 @@ SeaStore::Shard::get_attr(
Transaction::src_t::READ,
"get_attr",
op_type_t::GET_ATTR,
+ op_flags,
[this, name](auto &t, auto& onode) {
return _get_attr(t, onode, name);
}).handle_error(
@@ -1296,7 +1305,8 @@ SeaStore::Shard::_get_attrs(
SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t>
SeaStore::Shard::get_attrs(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1307,6 +1317,7 @@ SeaStore::Shard::get_attrs(
Transaction::src_t::READ,
"get_attrs",
op_type_t::GET_ATTRS,
+ op_flags,
[this](auto &t, auto& onode) {
return _get_attrs(t, onode);
}).handle_error(
@@ -1331,14 +1342,15 @@ seastar::future<struct stat> SeaStore::Shard::_stat(
st.st_blksize = device->get_block_size();
st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize;
st.st_nlink = 1;
- DEBUGT("oid={}, size={}, blksize={}",
+ DEBUGT("oid={}, size=0x{:x}, blksize=0x{:x}",
t, oid, st.st_size, st.st_blksize);
return seastar::make_ready_future<struct stat>(st);
}
seastar::future<struct stat> SeaStore::Shard::stat(
CollectionRef c,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1349,6 +1361,7 @@ seastar::future<struct stat> SeaStore::Shard::stat(
Transaction::src_t::READ,
"stat",
op_type_t::STAT,
+ op_flags,
[this, oid](auto &t, auto &onode) {
return _stat(t, onode, oid);
}).handle_error(
@@ -1364,9 +1377,10 @@ seastar::future<struct stat> SeaStore::Shard::stat(
SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
SeaStore::Shard::omap_get_header(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
- return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY);
+ return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY, op_flags);
}
SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t>
@@ -1389,7 +1403,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const omap_keys_t &keys)
+ const omap_keys_t &keys,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1400,6 +1415,7 @@ SeaStore::Shard::omap_get_values(
Transaction::src_t::READ,
"omap_get_values",
op_type_t::OMAP_GET_VALUES,
+ op_flags,
[this, keys](auto &t, auto &onode) {
return do_omap_get_values(t, onode, keys);
}).finally([this] {
@@ -1529,7 +1545,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_paged_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const std::optional<std::string> &start)
+ const std::optional<std::string> &start,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1540,6 +1557,7 @@ SeaStore::Shard::omap_get_values(
Transaction::src_t::READ,
"omap_get_values2",
op_type_t::OMAP_GET_VALUES2,
+ op_flags,
[this, start](auto &t, auto &onode) {
return do_omap_get_values(t, onode, start);
}).finally([this] {
@@ -1589,7 +1607,8 @@ SeaStore::Shard::fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1600,6 +1619,7 @@ SeaStore::Shard::fiemap(
Transaction::src_t::READ,
"fiemap",
op_type_t::READ,
+ op_flags,
[this, off, len](auto &t, auto &onode) {
return _fiemap(t, onode, off, len);
}).finally([this] {
@@ -1640,7 +1660,7 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks(
[this, num_bytes](auto &ctx) {
LOG_PREFIX(SeaStoreS::do_transaction_no_callbacks);
return with_trans_intr(*ctx.transaction, [&ctx, this, FNAME, num_bytes](auto &t) {
- DEBUGT("cid={}, {} operations, {} bytes, {} colls, {} objects ...",
+ DEBUGT("cid={}, {} operations, 0x{:x} bytes, {} colls, {} objects ...",
t, ctx.ch->get_cid(),
ctx.ext_transaction.get_num_ops(),
num_bytes,
@@ -2677,6 +2697,7 @@ seastar::future<> SeaStore::Shard::write_meta(
return transaction_manager->with_transaction_intr(
Transaction::src_t::MUTATE,
"write_meta",
+ CACHE_HINT_NOCACHE,
[this, &key, &value](auto& t)
{
LOG_PREFIX(SeaStoreS::write_meta);
@@ -2721,6 +2742,13 @@ SeaStore::read_meta(const std::string& key)
);
}
+seastar::future<std::string> SeaStore::get_default_device_class()
+{
+ using crimson::common::get_conf;
+ std::string type = get_conf<std::string>("seastore_main_device_type");
+ return seastar::make_ready_future<std::string>(type);
+}
+
uuid_d SeaStore::Shard::get_fsid() const
{
return device->get_meta().seastore_id;
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
index 185072744f2..e2a993b9e20 100644
--- a/src/crimson/os/seastore/seastore.h
+++ b/src/crimson/os/seastore/seastore.h
@@ -101,7 +101,8 @@ public:
seastar::future<struct stat> stat(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<ceph::bufferlist> read(
CollectionRef c,
@@ -118,32 +119,38 @@ public:
base_errorator::future<bool> exists(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
get_attr_errorator::future<ceph::bufferlist> get_attr(
CollectionRef c,
const ghobject_t& oid,
- std::string_view name) const final;
+ std::string_view name,
+ uint32_t op_flags = 0) const final;
get_attrs_ertr::future<attrs_t> get_attrs(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) final;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) final;
/// Retrieves paged set of values > start (if present)
read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) final; ///< @return <done, values> values.empty() iff done
get_attr_errorator::future<bufferlist> omap_get_header(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
/// std::get<1>(ret) returns end if and only if the listing has listed all
/// the items within the range, otherwise it returns the next key to be listed.
@@ -151,7 +158,8 @@ public:
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const final;
+ uint64_t limit,
+ uint32_t op_flags = 0) const final;
seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
@@ -170,7 +178,8 @@ public:
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len) final;
+ uint64_t len,
+ uint32_t op_flags = 0) final;
unsigned get_max_attr_name_length() const final {
return 256;
@@ -191,6 +200,8 @@ public:
seastar::future<> write_meta(const std::string& key,
const std::string& value);
+ seastar::future<std::string> get_default_device_class();
+
store_statfs_t stat() const;
uuid_d get_fsid() const;
@@ -249,7 +260,8 @@ public:
return seastar::do_with(
internal_context_t(
ch, std::move(t),
- transaction_manager->create_transaction(src, tname)),
+ transaction_manager->create_transaction(
+ src, tname, t.get_fadvise_flags())),
std::forward<F>(f),
[this, op_type](auto &ctx, auto &f) {
assert(shard_stats.starting_io_num);
@@ -296,20 +308,22 @@ public:
Transaction::src_t src,
const char* tname,
op_type_t op_type,
+ cache_hint_t cache_hint_flags,
F &&f) const {
auto begin_time = std::chrono::steady_clock::now();
return seastar::do_with(
oid, Ret{}, std::forward<F>(f),
- [this, ch, src, op_type, begin_time, tname
+ [this, ch, src, op_type, begin_time, tname, cache_hint_flags
](auto &oid, auto &ret, auto &f)
{
- return repeat_eagain([&, this, ch, src, tname] {
+ return repeat_eagain([&, this, ch, src, tname, cache_hint_flags] {
assert(src == Transaction::src_t::READ);
++(shard_stats.repeat_read_num);
return transaction_manager->with_transaction_intr(
src,
tname,
+ cache_hint_flags,
[&, this, ch, tname](auto& t)
{
LOG_PREFIX(SeaStoreS::repeat_with_onode);
@@ -567,6 +581,8 @@ public:
seastar::future<std::vector<coll_core_t>> list_collections() final;
+ seastar::future<std::string> get_default_device_class() final;
+
FuturizedStore::Shard& get_sharded_store() final {
return shard_stores.local();
}
diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc
index f379dd0117c..a57f56d4ab4 100644
--- a/src/crimson/os/seastore/seastore_types.cc
+++ b/src/crimson/os/seastore/seastore_types.cc
@@ -246,6 +246,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t)
return out << "LADDR_LEAF";
case extent_types_t::ONODE_BLOCK_STAGED:
return out << "ONODE_BLOCK_STAGED";
+ case extent_types_t::ROOT_META:
+ return out << "ROOT_META";
case extent_types_t::OMAP_INNER:
return out << "OMAP_INNER";
case extent_types_t::OMAP_LEAF:
@@ -349,11 +351,11 @@ std::ostream &operator<<(std::ostream &out, const delta_info_t &delta)
<< "type: " << delta.type
<< ", paddr: " << delta.paddr
<< ", laddr: " << delta.laddr
- << ", prev_crc: " << delta.prev_crc
- << ", final_crc: " << delta.final_crc
- << ", length: " << delta.length
+ << ", prev_crc: 0x" << std::hex << delta.prev_crc
+ << ", final_crc: 0x" << delta.final_crc
+ << ", length: 0x" << delta.length << std::dec
<< ", pversion: " << delta.pversion
- << ", ext_seq: " << delta.ext_seq
+ << ", ext_seq: " << segment_seq_printer_t{delta.ext_seq}
<< ", seg_type: " << delta.seg_type
<< ")";
}
@@ -371,7 +373,7 @@ std::ostream &operator<<(std::ostream &out, const extent_info_t &info)
return out << "extent_info_t("
<< "type: " << info.type
<< ", addr: " << info.addr
- << ", len: " << info.len
+ << ", len: 0x" << std::hex << info.len << std::dec
<< ")";
}
@@ -385,7 +387,7 @@ std::ostream &operator<<(std::ostream &out, const segment_header_t &header)
<< " " << rewrite_gen_printer_t{header.generation}
<< ", dirty_tail=" << header.dirty_tail
<< ", alloc_tail=" << header.alloc_tail
- << ", segment_nonce=" << header.segment_nonce
+ << ", segment_nonce=0x" << std::hex << header.segment_nonce << std::dec
<< ", modify_time=" << mod_time_point_printer_t{header.modify_time}
<< ")";
}
@@ -396,7 +398,7 @@ std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail)
<< tail.physical_segment_id
<< " " << tail.type
<< " " << segment_seq_printer_t{tail.segment_seq}
- << ", segment_nonce=" << tail.segment_nonce
+ << ", segment_nonce=0x" << std::hex << tail.segment_nonce << std::dec
<< ", modify_time=" << mod_time_point_printer_t{tail.modify_time}
<< ", num_extents=" << tail.num_extents
<< ")";
@@ -462,8 +464,8 @@ std::ostream &operator<<(std::ostream& out, const record_size_t& rsize)
{
return out << "record_size_t("
<< "record_type=" << rsize.record_type
- << "raw_md=" << rsize.get_raw_mdlength()
- << ", data=" << rsize.dlength
+ << "raw_md=0x" << std::hex << rsize.get_raw_mdlength()
+ << ", data=0x" << rsize.dlength << std::dec
<< ")";
}
@@ -507,11 +509,11 @@ std::ostream& operator<<(std::ostream& out, const record_group_header_t& h)
{
return out << "record_group_header_t("
<< "num_records=" << h.records
- << ", mdlength=" << h.mdlength
- << ", dlength=" << h.dlength
- << ", nonce=" << h.segment_nonce
+ << ", mdlength=0x" << std::hex << h.mdlength
+ << ", dlength=0x" << h.dlength
+ << ", segment_nonce=0x" << h.segment_nonce << std::dec
<< ", committed_to=" << h.committed_to
- << ", data_crc=" << h.data_crc
+ << ", data_crc=0x" << std::hex << h.data_crc << std::dec
<< ")";
}
@@ -554,9 +556,9 @@ std::ostream& operator<<(std::ostream& out, const record_group_size_t& size)
{
return out << "record_group_size_t("
<< "record_type=" << size.record_type
- << "raw_md=" << size.get_raw_mdlength()
- << ", data=" << size.dlength
- << ", block_size=" << size.block_size
+ << "raw_md=0x" << std::hex << size.get_raw_mdlength()
+ << ", data=0x" << size.dlength
+ << ", block_size=0x" << size.block_size << std::dec
<< ", fullness=" << size.get_fullness()
<< ")";
}
@@ -911,7 +913,7 @@ std::ostream& operator<<(std::ostream& out, const write_result_t& w)
{
return out << "write_result_t("
<< "start=" << w.start_seq
- << ", length=" << w.length
+ << ", length=0x" << std::hex << w.length << std::dec
<< ")";
}
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h
index df5c184e7ab..5930469ca07 100644
--- a/src/crimson/os/seastore/seastore_types.h
+++ b/src/crimson/os/seastore/seastore_types.h
@@ -3,6 +3,7 @@
#pragma once
+#include <deque>
#include <limits>
#include <numeric>
#include <optional>
@@ -14,13 +15,47 @@
#include "include/byteorder.h"
#include "include/denc.h"
+#include "include/encoding.h"
#include "include/buffer.h"
#include "include/intarith.h"
#include "include/interval_set.h"
#include "include/uuid.h"
+#include "include/rados.h"
namespace crimson::os::seastore {
+class cache_hint_t {
+ enum hint_t {
+ TOUCH,
+ NOCACHE
+ };
+public:
+ static constexpr cache_hint_t get_touch() {
+ return hint_t::TOUCH;
+ }
+ static constexpr cache_hint_t get_nocache() {
+ return hint_t::NOCACHE;
+ }
+ cache_hint_t(uint32_t flags) {
+ if (unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) ||
+ unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) {
+ hint = NOCACHE;
+ }
+ }
+ bool operator==(const cache_hint_t &other) const {
+ return hint == other.hint;
+ }
+ bool operator!=(const cache_hint_t &other) const {
+ return hint != other.hint;
+ }
+private:
+ constexpr cache_hint_t(hint_t hint) : hint(hint) {}
+ hint_t hint = hint_t::TOUCH;
+};
+
+inline constexpr cache_hint_t CACHE_HINT_TOUCH = cache_hint_t::get_touch();
+inline constexpr cache_hint_t CACHE_HINT_NOCACHE = cache_hint_t::get_nocache();
+
/* using a special xattr key "omap_header" to store omap header */
const std::string OMAP_HEADER_XATTR_KEY = "omap_header";
@@ -1226,7 +1261,6 @@ constexpr laddr_t L_ADDR_MAX = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX);
constexpr laddr_t L_ADDR_MIN = laddr_t::from_raw_uint(0);
constexpr laddr_t L_ADDR_NULL = L_ADDR_MAX;
constexpr laddr_t L_ADDR_ROOT = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX - 1);
-constexpr laddr_t L_ADDR_LBAT = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX - 2);
struct __attribute__((packed)) laddr_le_t {
ceph_le64 laddr;
@@ -1378,23 +1412,24 @@ enum class extent_types_t : uint8_t {
LADDR_INTERNAL = 1,
LADDR_LEAF = 2,
DINK_LADDR_LEAF = 3, // should only be used for unitttests
- OMAP_INNER = 4,
- OMAP_LEAF = 5,
- ONODE_BLOCK_STAGED = 6,
- COLL_BLOCK = 7,
- OBJECT_DATA_BLOCK = 8,
- RETIRED_PLACEHOLDER = 9,
+ ROOT_META = 4,
+ OMAP_INNER = 5,
+ OMAP_LEAF = 6,
+ ONODE_BLOCK_STAGED = 7,
+ COLL_BLOCK = 8,
+ OBJECT_DATA_BLOCK = 9,
+ RETIRED_PLACEHOLDER = 10,
// the following two types are not extent types,
// they are just used to indicates paddr allocation deltas
- ALLOC_INFO = 10,
- JOURNAL_TAIL = 11,
+ ALLOC_INFO = 11,
+ JOURNAL_TAIL = 12,
// Test Block Types
- TEST_BLOCK = 12,
- TEST_BLOCK_PHYSICAL = 13,
- BACKREF_INTERNAL = 14,
- BACKREF_LEAF = 15,
+ TEST_BLOCK = 13,
+ TEST_BLOCK_PHYSICAL = 14,
+ BACKREF_INTERNAL = 15,
+ BACKREF_LEAF = 16,
// None and the number of valid extent_types_t
- NONE = 16,
+ NONE = 17,
};
using extent_types_le_t = uint8_t;
constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE);
@@ -1409,12 +1444,12 @@ constexpr bool is_data_type(extent_types_t type) {
}
constexpr bool is_logical_metadata_type(extent_types_t type) {
- return type >= extent_types_t::OMAP_INNER &&
+ return type >= extent_types_t::ROOT_META &&
type <= extent_types_t::COLL_BLOCK;
}
constexpr bool is_logical_type(extent_types_t type) {
- if ((type >= extent_types_t::OMAP_INNER &&
+ if ((type >= extent_types_t::ROOT_META &&
type <= extent_types_t::OBJECT_DATA_BLOCK) ||
type == extent_types_t::TEST_BLOCK) {
assert(is_logical_metadata_type(type) ||
@@ -1466,6 +1501,23 @@ constexpr bool is_physical_type(extent_types_t type) {
}
}
+constexpr bool is_backref_mapped_type(extent_types_t type) {
+ if ((type >= extent_types_t::LADDR_INTERNAL &&
+ type <= extent_types_t::OBJECT_DATA_BLOCK) ||
+ type == extent_types_t::TEST_BLOCK ||
+ type == extent_types_t::TEST_BLOCK_PHYSICAL) {
+ assert(is_logical_type(type) ||
+ is_lba_node(type) ||
+ type == extent_types_t::TEST_BLOCK_PHYSICAL);
+ return true;
+ } else {
+ assert(!is_logical_type(type) &&
+ !is_lba_node(type) &&
+ type != extent_types_t::TEST_BLOCK_PHYSICAL);
+ return false;
+ }
+}
+
constexpr bool is_real_type(extent_types_t type) {
if (type <= extent_types_t::OBJECT_DATA_BLOCK ||
(type >= extent_types_t::TEST_BLOCK &&
@@ -1617,8 +1669,8 @@ struct delta_info_t {
extent_types_t type = extent_types_t::NONE; ///< delta type
paddr_t paddr; ///< physical address
laddr_t laddr = L_ADDR_NULL; ///< logical address
- uint32_t prev_crc = 0;
- uint32_t final_crc = 0;
+ checksum_t prev_crc = 0;
+ checksum_t final_crc = 0;
extent_len_t length = 0; ///< extent length
extent_version_t pversion; ///< prior version
segment_seq_t ext_seq; ///< seq of the extent's segment
@@ -1926,54 +1978,29 @@ using backref_root_t = phy_tree_root_t;
* TODO: generalize this to permit more than one lba_manager implementation
*/
struct __attribute__((packed)) root_t {
- using meta_t = std::map<std::string, std::string>;
-
- static constexpr int MAX_META_LENGTH = 1024;
-
backref_root_t backref_root;
lba_root_t lba_root;
laddr_le_t onode_root;
coll_root_le_t collection_root;
+ laddr_le_t meta;
- char meta[MAX_META_LENGTH];
-
- root_t() {
- set_meta(meta_t{});
- }
+ root_t() = default;
void adjust_addrs_from_base(paddr_t base) {
lba_root.adjust_addrs_from_base(base);
backref_root.adjust_addrs_from_base(base);
}
-
- meta_t get_meta() {
- bufferlist bl;
- bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta));
- meta_t ret;
- auto iter = bl.cbegin();
- decode(ret, iter);
- return ret;
- }
-
- void set_meta(const meta_t &m) {
- ceph::bufferlist bl;
- encode(m, bl);
- ceph_assert(bl.length() < MAX_META_LENGTH);
- bl.rebuild();
- auto &bptr = bl.front();
- ::memset(meta, 0, MAX_META_LENGTH);
- ::memcpy(meta, bptr.c_str(), bl.length());
- }
};
struct alloc_blk_t {
alloc_blk_t(
- paddr_t paddr,
- laddr_t laddr,
+ const paddr_t& paddr,
+ const laddr_t& laddr,
extent_len_t len,
extent_types_t type)
- : paddr(paddr), laddr(laddr), len(len), type(type)
- {}
+ : paddr(paddr), laddr(laddr), len(len), type(type) {
+ assert(len > 0);
+ }
explicit alloc_blk_t() = default;
@@ -1989,6 +2016,25 @@ struct alloc_blk_t {
denc(v.type, p);
DENC_FINISH(p);
}
+
+ static alloc_blk_t create_alloc(
+ const paddr_t& paddr,
+ const laddr_t& laddr,
+ extent_len_t len,
+ extent_types_t type) {
+ assert(is_backref_mapped_type(type));
+ assert(laddr != L_ADDR_NULL);
+ return alloc_blk_t(paddr, laddr, len, type);
+ }
+
+ static alloc_blk_t create_retire(
+ const paddr_t& paddr,
+ extent_len_t len,
+ extent_types_t type) {
+ assert(is_backref_mapped_type(type) ||
+ is_retired_placeholder_type(type));
+ return alloc_blk_t(paddr, L_ADDR_NULL, len, type);
+ }
};
// use absolute address
diff --git a/src/crimson/os/seastore/segment_manager.cc b/src/crimson/os/seastore/segment_manager.cc
index 1be9cce5f6b..3eced41081e 100644
--- a/src/crimson/os/seastore/segment_manager.cc
+++ b/src/crimson/os/seastore/segment_manager.cc
@@ -16,10 +16,10 @@ namespace crimson::os::seastore {
std::ostream& operator<<(std::ostream& out, const block_shard_info_t& sf)
{
out << "("
- << "size=" << sf.size
- << ", segments=" <<sf.segments
- << ", tracker_offset=" <<sf.tracker_offset
- << ", first_segment_offset=" <<sf.first_segment_offset
+ << "size=0x" << std::hex << sf.size << std::dec
+ << ", segments=" << sf.segments
+ << ", tracker_offset=0x" << std::hex << sf.tracker_offset
+ << ", first_segment_offset=0x" << sf.first_segment_offset << std::dec
<<")";
return out;
}
@@ -28,8 +28,8 @@ std::ostream& operator<<(std::ostream& out, const block_sm_superblock_t& sb)
{
out << "superblock("
<< "shard_num=" << sb.shard_num
- << ", segment_size=" << sb.segment_size
- << ", block_size=" << sb.block_size
+ << ", segment_size=0x" << std::hex << sb.segment_size
+ << ", block_size=0x" << sb.block_size << std::dec
<< ", shard_info:";
for (auto &sf : sb.shard_infos) {
out << sf
diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc
index 0500271f81a..7077aad7407 100644
--- a/src/crimson/os/seastore/segment_manager/block.cc
+++ b/src/crimson/os/seastore/segment_manager/block.cc
@@ -60,7 +60,7 @@ static write_ertr::future<> do_write(
{
LOG_PREFIX(block_do_write);
auto len = bptr.length();
- TRACE("{} poffset={}~{} ...",
+ TRACE("{} poffset=0x{:x}~0x{:x} ...",
device_id_printer_t{device_id}, offset, len);
return device.dma_write(
offset,
@@ -68,16 +68,16 @@ static write_ertr::future<> do_write(
len
).handle_exception(
[FNAME, device_id, offset, len](auto e) -> write_ertr::future<size_t> {
- ERROR("{} poffset={}~{} got error -- {}",
+ ERROR("{} poffset=0x{:x}~0x{:x} got error -- {}",
device_id_printer_t{device_id}, offset, len, e);
return crimson::ct_error::input_output_error::make();
}).then([FNAME, device_id, offset, len](auto result) -> write_ertr::future<> {
if (result != len) {
- ERROR("{} poffset={}~{} write len={} inconsistent",
+ ERROR("{} poffset=0x{:x}~0x{:x} write len=0x{:x} inconsistent",
device_id_printer_t{device_id}, offset, len, result);
return crimson::ct_error::input_output_error::make();
}
- TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len);
+ TRACE("{} poffset=0x{:x}~0x{:x} done", device_id_printer_t{device_id}, offset, len);
return write_ertr::now();
});
}
@@ -90,7 +90,7 @@ static write_ertr::future<> do_writev(
size_t block_size)
{
LOG_PREFIX(block_do_writev);
- TRACE("{} poffset={}~{}, {} buffers",
+ TRACE("{} poffset=0x{:x}~0x{:x}, {} buffers",
device_id_printer_t{device_id}, offset, bl.length(), bl.get_num_buffers());
// writev requires each buffer to be aligned to the disks' block
@@ -109,22 +109,22 @@ static write_ertr::future<> do_writev(
auto off = offset + p.offset;
auto len = p.length;
auto& iov = p.iov;
- TRACE("{} poffset={}~{} dma_write ...",
+ TRACE("{} poffset=0x{:x}~0x{:x} dma_write ...",
device_id_printer_t{device_id}, off, len);
return device.dma_write(off, std::move(iov)
).handle_exception(
[FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t>
{
- ERROR("{} poffset={}~{} dma_write got error -- {}",
+ ERROR("{} poffset=0x{:x}~0x{:x} dma_write got error -- {}",
device_id_printer_t{device_id}, off, len, e);
return crimson::ct_error::input_output_error::make();
}).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> {
if (written != len) {
- ERROR("{} poffset={}~{} dma_write len={} inconsistent",
+ ERROR("{} poffset=0x{:x}~0x{:x} dma_write len=0x{:x} inconsistent",
device_id_printer_t{device_id}, off, len, written);
return crimson::ct_error::input_output_error::make();
}
- TRACE("{} poffset={}~{} dma_write done",
+ TRACE("{} poffset=0x{:x}~0x{:x} dma_write done",
device_id_printer_t{device_id}, off, len);
return write_ertr::now();
});
@@ -140,7 +140,7 @@ static read_ertr::future<> do_read(
bufferptr &bptr)
{
LOG_PREFIX(block_do_read);
- TRACE("{} poffset={}~{} ...", device_id_printer_t{device_id}, offset, len);
+ TRACE("{} poffset=0x{:x}~0x{:x} ...", device_id_printer_t{device_id}, offset, len);
assert(len <= bptr.length());
return device.dma_read(
offset,
@@ -153,16 +153,16 @@ static read_ertr::future<> do_read(
// once seastar::future<T>::handle_exception() returns seastar::futurize_t<T>
[FNAME, device_id, offset, len](auto e) -> read_ertr::future<size_t>
{
- ERROR("{} poffset={}~{} got error -- {}",
+ ERROR("{} poffset=0x{:x}~0x{:x} got error -- {}",
device_id_printer_t{device_id}, offset, len, e);
return crimson::ct_error::input_output_error::make();
}).then([FNAME, device_id, offset, len](auto result) -> read_ertr::future<> {
if (result != len) {
- ERROR("{} poffset={}~{} read len={} inconsistent",
+ ERROR("{} poffset=0x{:x}~0x{:x} read len=0x{:x} inconsistent",
device_id_printer_t{device_id}, offset, len, result);
return crimson::ct_error::input_output_error::make();
}
- TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len);
+ TRACE("{} poffset=0x{:x}~0x{:x} done", device_id_printer_t{device_id}, offset, len);
return read_ertr::now();
});
}
@@ -174,7 +174,7 @@ SegmentStateTracker::write_out(
uint64_t offset)
{
LOG_PREFIX(SegmentStateTracker::write_out);
- DEBUG("{} poffset={}~{}",
+ DEBUG("{} poffset=0x{:x}~0x{:x}",
device_id_printer_t{device_id}, offset, bptr.length());
return do_write(device_id, device, offset, bptr);
}
@@ -186,7 +186,7 @@ SegmentStateTracker::read_in(
uint64_t offset)
{
LOG_PREFIX(SegmentStateTracker::read_in);
- DEBUG("{} poffset={}~{}",
+ DEBUG("{} poffset=0x{:x}~0x{:x}",
device_id_printer_t{device_id}, offset, bptr.length());
return do_read(
device_id,
@@ -230,7 +230,7 @@ block_sm_superblock_t make_superblock(
+ i * segments_per_shard * config_segment_size;
}
- INFO("{} disk_size={}, segment_size={}, block_size={}",
+ INFO("{} disk_size=0x{:x}, segment_size=0x{:x}, block_size=0x{:x}",
device_id_printer_t{device_id},
size,
uint64_t(config_segment_size),
@@ -255,7 +255,7 @@ static check_create_device_ret check_create_device(
size_t size)
{
LOG_PREFIX(block_check_create_device);
- INFO("path={}, size={}", path, size);
+ INFO("path={}, size=0x{:x}", path, size);
return seastar::open_file_dma(
path,
seastar::open_flags::exclusive |
@@ -266,7 +266,7 @@ static check_create_device_ret check_create_device(
file,
[size, FNAME, &path](auto &f) -> seastar::future<>
{
- DEBUG("path={} created, truncating to {}", path, size);
+ DEBUG("path={} created, truncating to 0x{:x}", path, size);
ceph_assert(f);
return f.truncate(
size
@@ -318,8 +318,8 @@ open_device_ret open_device(
).then([stat, &path, FNAME](auto file) mutable {
return file.size().then([stat, file, &path, FNAME](auto size) mutable {
stat.size = size;
- INFO("path={} successful, size={}, block_size={}",
- path, stat.size, stat.block_size);
+ INFO("path={} successful, size=0x{:x}, block_size=0x{:x}",
+ path, stat.size, stat.block_size);
return std::make_pair(file, stat);
});
});
@@ -410,19 +410,19 @@ Segment::write_ertr::future<> BlockSegment::write(
{
LOG_PREFIX(BlockSegment::write);
auto paddr = paddr_t::make_seg_paddr(id, offset);
- DEBUG("{} offset={}~{} poffset={} ...",
+ DEBUG("{} offset=0x{:x}~0x{:x} poffset=0x{:x} ...",
id, offset, bl.length(), manager.get_offset(paddr));
if (offset < write_pointer ||
offset % manager.superblock.block_size != 0 ||
bl.length() % manager.superblock.block_size != 0) {
- ERROR("{} offset={}~{} poffset={} invalid write",
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} invalid write",
id, offset, bl.length(), manager.get_offset(paddr));
return crimson::ct_error::invarg::make();
}
if (offset + bl.length() > manager.superblock.segment_size) {
- ERROR("{} offset={}~{} poffset={} write out of the range {}",
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} write out of the range 0x{:x}",
id, offset, bl.length(), manager.get_offset(paddr),
manager.superblock.segment_size);
return crimson::ct_error::enospc::make();
@@ -443,7 +443,7 @@ Segment::close_ertr::future<> BlockSegmentManager::segment_close(
LOG_PREFIX(BlockSegmentManager::segment_close);
auto s_id = id.device_segment_id();
int unused_bytes = get_segment_size() - write_pointer;
- INFO("{} unused_bytes={} ...", id, unused_bytes);
+ INFO("{} unused_bytes=0x{:x} ...", id, unused_bytes);
assert(unused_bytes >= 0);
assert(id.device_id() == get_device_id());
@@ -693,24 +693,24 @@ SegmentManager::read_ertr::future<> BlockSegmentManager::read(
auto s_id = id.device_segment_id();
auto s_off = seg_addr.get_segment_off();
auto p_off = get_offset(addr);
- DEBUG("{} offset={}~{} poffset={} ...", id, s_off, len, p_off);
+ DEBUG("{} offset=0x{:x}~0x{:x} poffset=0x{:x} ...", id, s_off, len, p_off);
assert(addr.get_device_id() == get_device_id());
if (s_off % superblock.block_size != 0 ||
len % superblock.block_size != 0) {
- ERROR("{} offset={}~{} poffset={} invalid read", id, s_off, len, p_off);
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} invalid read", id, s_off, len, p_off);
return crimson::ct_error::invarg::make();
}
if (s_id >= get_num_segments()) {
- ERROR("{} offset={}~{} poffset={} segment-id out of range {}",
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} segment-id out of range {}",
id, s_off, len, p_off, get_num_segments());
return crimson::ct_error::invarg::make();
}
if (s_off + len > superblock.segment_size) {
- ERROR("{} offset={}~{} poffset={} read out of range {}",
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} read out of range 0x{:x}",
id, s_off, len, p_off, superblock.segment_size);
return crimson::ct_error::invarg::make();
}
@@ -718,7 +718,7 @@ SegmentManager::read_ertr::future<> BlockSegmentManager::read(
if (tracker->get(s_id) == segment_state_t::EMPTY) {
// XXX: not an error during scanning,
// might need refactor to increase the log level
- DEBUG("{} offset={}~{} poffset={} invalid state {}",
+ DEBUG("{} offset=0x{:x}~0x{:x} poffset=0x{:x} invalid state {}",
id, s_off, len, p_off, tracker->get(s_id));
return crimson::ct_error::enoent::make();
}
diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc
index 4a4873afb94..bdd97e88733 100644
--- a/src/crimson/os/seastore/segment_manager/ephemeral.cc
+++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc
@@ -20,8 +20,11 @@ namespace {
namespace crimson::os::seastore::segment_manager {
std::ostream &operator<<(std::ostream &lhs, const ephemeral_config_t &c) {
- return lhs << "ephemeral_config_t(size=" << c.size << ", block_size=" << c.block_size
- << ", segment_size=" << c.segment_size << ")";
+ return lhs << "ephemeral_config_t(size=0x"
+ << std::hex << c.size
+ << ", block_size=0x" << c.block_size
+ << ", segment_size=0x" << c.segment_size
+ << std::dec << ")";
}
EphemeralSegmentManagerRef create_test_ephemeral() {
@@ -141,7 +144,8 @@ Segment::write_ertr::future<> EphemeralSegmentManager::segment_write(
{
auto& seg_addr = addr.as_seg_paddr();
logger().debug(
- "segment_write to segment {} at offset {}, physical offset {}, len {}, crc {}",
+ "segment_write to segment {} at offset 0x{:x}, "
+ "physical offset 0x{:x}, len 0x{:x}, crc 0x{:x}",
seg_addr.get_segment_id(),
seg_addr.get_segment_off(),
get_offset(addr),
@@ -268,7 +272,7 @@ SegmentManager::read_ertr::future<> EphemeralSegmentManager::read(
if (seg_addr.get_segment_off() + len > config.segment_size) {
logger().error(
- "EphemeralSegmentManager::read: invalid offset {}~{}!",
+ "EphemeralSegmentManager::read: invalid offset {}~0x{:x}!",
addr,
len);
return crimson::ct_error::invarg::make();
@@ -279,7 +283,8 @@ SegmentManager::read_ertr::future<> EphemeralSegmentManager::read(
bufferlist bl;
bl.push_back(out);
logger().debug(
- "segment_read to segment {} at offset {}, physical offset {}, length {}, crc {}",
+ "segment_read to segment {} at offset 0x{:x}, "
+ "physical offset 0x{:x}, length 0x{:x}, crc 0x{:x}",
seg_addr.get_segment_id().device_segment_id(),
seg_addr.get_segment_off(),
get_offset(addr),
diff --git a/src/crimson/os/seastore/segment_manager/zbd.cc b/src/crimson/os/seastore/segment_manager/zbd.cc
index 88521a947f8..22efbed5940 100644
--- a/src/crimson/os/seastore/segment_manager/zbd.cc
+++ b/src/crimson/os/seastore/segment_manager/zbd.cc
@@ -56,7 +56,7 @@ static open_device_ret open_device(
path, seastar::follow_symlink::yes
).then([FNAME, mode, &path](auto stat) mutable {
return seastar::open_file_dma(path, mode).then([=](auto file) {
- DEBUG("open of device {} successful, size {}",
+ DEBUG("open of device {} successful, size 0x{:x}",
path,
stat.size);
return std::make_pair(file, stat);
@@ -100,11 +100,12 @@ static zbd_sm_metadata_t make_metadata(
WARN("Ignoring configuration values for device and segment size");
INFO(
- "device size: {}, available size: {}, block size: {}, allocated size: {},"
- " total zones {}, zone size: {}, zone capacity: {},"
- " total segments: {}, zones per segment: {}, segment size: {}"
+ "device size: 0x{:x}, available size: 0x{:x},"
+ " block size: 0x{:x}, allocated size: 0x{:x},"
+ " total zones {}, zone size: 0x{:x}, zone capacity: 0x{:x},"
+ " total segments: {}, zones per segment: {}, segment size: 0x{:x}"
" conv zones: {}, swr zones: {}, per shard segments: {}"
- " per shard available size: {}",
+ " per shard available size: 0x{:x}",
total_size,
available_size,
data.block_size,
@@ -126,8 +127,8 @@ static zbd_sm_metadata_t make_metadata(
shard_infos[i].segments = per_shard_segments;
shard_infos[i].first_segment_offset = zone_size * skipped_zones
+ i * segment_size * per_shard_segments;
- INFO("First segment offset for shard {} is: {}",
- i, shard_infos[i].first_segment_offset);
+ INFO("First segment offset for shard {} is: 0x{:x}",
+ i, shard_infos[i].first_segment_offset);
}
zbd_sm_metadata_t ret = zbd_sm_metadata_t{
@@ -248,7 +249,7 @@ static write_ertr::future<> do_write(
bufferptr &bptr)
{
LOG_PREFIX(ZBDSegmentManager::do_write);
- DEBUG("offset {} len {}",
+ DEBUG("offset 0x{:x} len 0x{:x}",
offset,
bptr.length());
return device.dma_write(
@@ -277,7 +278,7 @@ static write_ertr::future<> do_writev(
size_t block_size)
{
LOG_PREFIX(ZBDSegmentManager::do_writev);
- DEBUG("{} offset {} len {}",
+ DEBUG("{} offset 0x{:x} len 0x{:x}",
device_id_printer_t{device_id}, offset, bl.length());
// writev requires each buffer to be aligned to the disks' block
// size, we need to rebuild here
@@ -295,23 +296,23 @@ static write_ertr::future<> do_writev(
auto off = offset + p.offset;
auto len = p.length;
auto& iov = p.iov;
- DEBUG("{} poffset={}~{} dma_write ...",
+ DEBUG("{} poffset=0x{:x}~0x{:x} dma_write ...",
device_id_printer_t{device_id},
off, len);
return device.dma_write(off, std::move(iov)
).handle_exception(
[FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t>
{
- ERROR("{} poffset={}~{} dma_write got error -- {}",
+ ERROR("{} poffset=0x{:x}~0x{:x} dma_write got error -- {}",
device_id_printer_t{device_id}, off, len, e);
return crimson::ct_error::input_output_error::make();
}).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> {
if (written != len) {
- ERROR("{} poffset={}~{} dma_write len={} inconsistent",
+ ERROR("{} poffset=0x{:x}~0x{:x} dma_write len=0x{:x} inconsistent",
device_id_printer_t{device_id}, off, len, written);
return crimson::ct_error::input_output_error::make();
}
- DEBUG("{} poffset={}~{} dma_write done",
+ DEBUG("{} poffset=0x{:x}~0x{:x} dma_write done",
device_id_printer_t{device_id},
off, len);
return write_ertr::now();
@@ -329,12 +330,12 @@ write_metadata(seastar::file &device, zbd_sm_metadata_t sb)
bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
[=, &device](auto &bp) {
LOG_PREFIX(ZBDSegmentManager::write_metadata);
- DEBUG("block_size {}", sb.block_size);
+ DEBUG("block_size 0x{:x}", sb.block_size);
bufferlist bl;
encode(sb, bl);
auto iter = bl.begin();
assert(bl.length() < sb.block_size);
- DEBUG("buffer length {}", bl.length());
+ DEBUG("buffer length 0x{:x}", bl.length());
iter.copy(bl.length(), bp.c_str());
DEBUG("doing writeout");
return do_write(device, 0, bp);
@@ -349,7 +350,7 @@ static read_ertr::future<> do_read(
{
LOG_PREFIX(ZBDSegmentManager::do_read);
assert(len <= bptr.length());
- DEBUG("offset {} len {}",
+ DEBUG("offset 0x{:x} len 0x{:x}",
offset,
len);
return device.dma_read(
@@ -659,7 +660,7 @@ SegmentManager::read_ertr::future<> ZBDSegmentManager::read(
}
if (seg_addr.get_segment_off() + len > metadata.segment_capacity) {
- ERROR("invalid read offset {}, len {}",
+ ERROR("invalid read offset {}, len 0x{:x}",
addr,
len);
return crimson::ct_error::invarg::make();
@@ -703,7 +704,7 @@ Segment::write_ertr::future<> ZBDSegmentManager::segment_write(
assert(addr.get_device_id() == get_device_id());
assert((bl.length() % metadata.block_size) == 0);
auto& seg_addr = addr.as_seg_paddr();
- DEBUG("write to segment {} at offset {}, physical offset {}, len {}",
+ DEBUG("write to segment {} at offset 0x{:x}, physical offset 0x{:x}, len 0x{:x}",
seg_addr.get_segment_id(),
seg_addr.get_segment_off(),
get_offset(addr),
@@ -756,7 +757,7 @@ Segment::write_ertr::future<> ZBDSegment::write(
LOG_PREFIX(ZBDSegment::write);
if (offset != write_pointer || offset % manager.metadata.block_size != 0) {
ERROR("Segment offset and zone write pointer mismatch. "
- "segment {} segment-offset {} write pointer {}",
+ "segment {} segment-offset 0x{:x} write pointer 0x{:x}",
id, offset, write_pointer);
return crimson::ct_error::invarg::make();
}
@@ -772,7 +773,7 @@ Segment::write_ertr::future<> ZBDSegment::write_padding_bytes(
size_t padding_bytes)
{
LOG_PREFIX(ZBDSegment::write_padding_bytes);
- DEBUG("Writing {} padding bytes to segment {} at wp {}",
+ DEBUG("Writing 0x{:x} padding bytes to segment {} at wp 0x{:x}",
padding_bytes, id, write_pointer);
return crimson::repeat([FNAME, padding_bytes, this] () mutable {
@@ -804,7 +805,7 @@ Segment::write_ertr::future<> ZBDSegment::advance_wp(
{
LOG_PREFIX(ZBDSegment::advance_wp);
- DEBUG("Advancing write pointer from {} to {}", write_pointer, offset);
+ DEBUG("Advancing write pointer from 0x{:x} to 0x{:x}", write_pointer, offset);
if (offset < write_pointer) {
return crimson::ct_error::invarg::make();
}
diff --git a/src/crimson/os/seastore/segment_manager_group.cc b/src/crimson/os/seastore/segment_manager_group.cc
index 332b794b70e..f4822c9a18c 100644
--- a/src/crimson/os/seastore/segment_manager_group.cc
+++ b/src/crimson/os/seastore/segment_manager_group.cc
@@ -26,13 +26,13 @@ SegmentManagerGroup::read_segment_tail(segment_id_t segment)
}
).safe_then([=, &segment_manager](bufferptr bptr) -> read_segment_tail_ret {
LOG_PREFIX(SegmentManagerGroup::read_segment_tail);
- DEBUG("segment {} bptr size {}", segment, bptr.length());
+ DEBUG("segment {} bptr size 0x{:x}", segment, bptr.length());
segment_tail_t tail;
bufferlist bl;
bl.push_back(bptr);
- DEBUG("segment {} block crc {}",
+ DEBUG("segment {} block crc 0x{:x}",
segment,
bl.begin().crc32c(segment_manager.get_block_size(), 0));
@@ -66,13 +66,13 @@ SegmentManagerGroup::read_segment_header(segment_id_t segment)
}
).safe_then([=, &segment_manager](bufferptr bptr) -> read_segment_header_ret {
LOG_PREFIX(SegmentManagerGroup::read_segment_header);
- DEBUG("segment {} bptr size {}", segment, bptr.length());
+ DEBUG("segment {} bptr size 0x{:x}", segment, bptr.length());
segment_header_t header;
bufferlist bl;
bl.push_back(bptr);
- DEBUG("segment {} block crc {}",
+ DEBUG("segment {} block crc 0x{:x}",
segment,
bl.begin().crc32c(segment_manager.get_block_size(), 0));
@@ -111,7 +111,7 @@ SegmentManagerGroup::read(paddr_t start, size_t len)
LOG_PREFIX(SegmentManagerGroup::read);
assert(has_device(start.get_device_id()));
auto& segment_manager = *segment_managers[start.get_device_id()];
- TRACE("reading data {}~{}", start, len);
+ TRACE("reading data {}~0x{:x}", start, len);
return segment_manager.read(
start,
len
diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h
index 5d8ad00ba22..cd8c333c69f 100644
--- a/src/crimson/os/seastore/transaction.h
+++ b/src/crimson/os/seastore/transaction.h
@@ -8,16 +8,17 @@
#include <boost/intrusive/list.hpp>
#include "crimson/common/log.h"
+#include "crimson/os/seastore/backref_entry.h"
+#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/logging.h"
#include "crimson/os/seastore/ordering_handle.h"
-#include "crimson/os/seastore/seastore_types.h"
-#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/root_block.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_interruptor.h"
namespace crimson::os::seastore {
class SeaStore;
-class Transaction;
struct io_stat_t {
uint64_t num = 0;
@@ -408,12 +409,14 @@ public:
src_t src,
journal_seq_t initiated_after,
on_destruct_func_t&& f,
- transaction_id_t trans_id
+ transaction_id_t trans_id,
+ cache_hint_t cache_hint
) : weak(weak),
handle(std::move(handle)),
on_destruct(std::move(f)),
src(src),
- trans_id(trans_id)
+ trans_id(trans_id),
+ cache_hint(cache_hint)
{}
void invalidate_clear_write_set() {
@@ -460,6 +463,7 @@ public:
ool_write_stats = {};
rewrite_stats = {};
conflicted = false;
+ assert(backref_entries.empty());
if (!has_reset) {
has_reset = true;
}
@@ -571,10 +575,23 @@ public:
return pre_alloc_list;
}
+ cache_hint_t get_cache_hint() const {
+ return cache_hint;
+ }
+
private:
friend class Cache;
friend Ref make_test_transaction();
+ void set_backref_entries(backref_entry_refs_t&& entries) {
+ assert(backref_entries.empty());
+ backref_entries = std::move(entries);
+ }
+
+ backref_entry_refs_t move_backref_entries() {
+ return std::move(backref_entries);
+ }
+
/**
* If set, *this may not be used to perform writes and will not provide
* consistentency allowing operations using to avoid maintaining a read_set.
@@ -669,6 +686,10 @@ private:
transaction_id_t trans_id = TRANS_ID_NULL;
seastar::lw_shared_ptr<rbm_pending_ool_t> pending_ool;
+
+ backref_entry_refs_t backref_entries;
+
+ cache_hint_t cache_hint = CACHE_HINT_TOUCH;
};
using TransactionRef = Transaction::Ref;
@@ -681,67 +702,11 @@ inline TransactionRef make_test_transaction() {
Transaction::src_t::MUTATE,
JOURNAL_SEQ_NULL,
[](Transaction&) {},
- ++next_id
+ ++next_id,
+ CACHE_HINT_TOUCH
);
}
-struct TransactionConflictCondition {
- class transaction_conflict final : public std::exception {
- public:
- const char* what() const noexcept final {
- return "transaction conflict detected";
- }
- };
-
-public:
- TransactionConflictCondition(Transaction &t) : t(t) {}
-
- template <typename Fut>
- std::optional<Fut> may_interrupt() {
- if (t.conflicted) {
- return seastar::futurize<Fut>::make_exception_future(
- transaction_conflict());
- } else {
- return std::optional<Fut>();
- }
- }
-
- template <typename T>
- static constexpr bool is_interruption_v =
- std::is_same_v<T, transaction_conflict>;
-
-
- static bool is_interruption(std::exception_ptr& eptr) {
- return *eptr.__cxa_exception_type() == typeid(transaction_conflict);
- }
-
-private:
- Transaction &t;
-};
-
-using trans_intr = crimson::interruptible::interruptor<
- TransactionConflictCondition
- >;
-
-template <typename E>
-using trans_iertr =
- crimson::interruptible::interruptible_errorator<
- TransactionConflictCondition,
- E
- >;
-
-template <typename F, typename... Args>
-auto with_trans_intr(Transaction &t, F &&f, Args&&... args) {
- return trans_intr::with_interruption_to_error<crimson::ct_error::eagain>(
- std::move(f),
- TransactionConflictCondition(t),
- t,
- std::forward<Args>(args)...);
-}
-
-template <typename T>
-using with_trans_ertr = typename T::base_ertr::template extend<crimson::ct_error::eagain>;
-
}
#if FMT_VERSION >= 90000
diff --git a/src/crimson/os/seastore/transaction_interruptor.cc b/src/crimson/os/seastore/transaction_interruptor.cc
new file mode 100644
index 00000000000..d22f760f2db
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_interruptor.cc
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/transaction_interruptor.h"
+
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore {
+
+bool TransactionConflictCondition::is_conflicted() const
+{
+ return t.conflicted;
+}
+
+}
diff --git a/src/crimson/os/seastore/transaction_interruptor.h b/src/crimson/os/seastore/transaction_interruptor.h
new file mode 100644
index 00000000000..d0522c23c19
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_interruptor.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <exception>
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+#include "crimson/common/errorator.h"
+#include "crimson/common/interruptible_future.h"
+
+namespace crimson::os::seastore {
+
+class Transaction;
+
+struct TransactionConflictCondition {
+ class transaction_conflict final : public std::exception {
+ public:
+ const char* what() const noexcept final {
+ return "transaction conflict detected";
+ }
+ };
+
+public:
+ TransactionConflictCondition(Transaction &t) : t(t) {}
+
+ template <typename Fut>
+ std::optional<Fut> may_interrupt() {
+ if (is_conflicted()) {
+ return seastar::futurize<Fut>::make_exception_future(
+ transaction_conflict());
+ } else {
+ return std::optional<Fut>();
+ }
+ }
+
+ template <typename T>
+ static constexpr bool is_interruption_v =
+ std::is_same_v<T, transaction_conflict>;
+
+
+ static bool is_interruption(std::exception_ptr& eptr) {
+ return *eptr.__cxa_exception_type() == typeid(transaction_conflict);
+ }
+
+private:
+ bool is_conflicted() const;
+
+ Transaction &t;
+};
+
+using trans_intr = crimson::interruptible::interruptor<
+ TransactionConflictCondition
+ >;
+
+template <typename E>
+using trans_iertr =
+ crimson::interruptible::interruptible_errorator<
+ TransactionConflictCondition,
+ E
+ >;
+
+template <typename F, typename... Args>
+auto with_trans_intr(Transaction &t, F &&f, Args&&... args) {
+ return trans_intr::with_interruption_to_error<crimson::ct_error::eagain>(
+ std::move(f),
+ TransactionConflictCondition(t),
+ t,
+ std::forward<Args>(args)...);
+}
+
+template <typename T>
+using with_trans_ertr = typename T::base_ertr::template extend<crimson::ct_error::eagain>;
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
index f4e3b0858f2..807d88b2cbc 100644
--- a/src/crimson/os/seastore/transaction_manager.cc
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -66,6 +66,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
return with_transaction_intr(
Transaction::src_t::MUTATE,
"mkfs_tm",
+ CACHE_HINT_TOUCH,
[this, FNAME](auto& t)
{
cache->init();
@@ -74,6 +75,8 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
return lba_manager->mkfs(t);
}).si_then([this, &t] {
return backref_manager->mkfs(t);
+ }).si_then([this, &t] {
+ return init_root_meta(t);
}).si_then([this, FNAME, &t] {
INFOT("submitting mkfs transaction", t);
return submit_transaction_direct(t);
@@ -129,6 +132,7 @@ TransactionManager::mount()
journal->get_trimmer().set_journal_head(start_seq);
return with_transaction_weak(
"mount",
+ CACHE_HINT_TOUCH,
[this](auto &t)
{
return cache->init_cached_extents(t, [this](auto &t, auto &e) {
@@ -219,7 +223,7 @@ TransactionManager::ref_ret TransactionManager::inc_ref(
TRACET("{}", t, offset);
return lba_manager->incref_extent(t, offset
).si_then([FNAME, offset, &t](auto result) {
- DEBUGT("extent refcount is incremented to {} -- {}~{}, {}",
+ DEBUGT("extent refcount is incremented to {} -- {}~0x{:x}, {}",
t, result.refcount, offset, result.length, result.addr);
return result.refcount;
});
@@ -459,8 +463,12 @@ TransactionManager::do_submit_transaction(
}
SUBTRACET(seastore_t, "submitting record", tref);
- return journal->submit_record(std::move(record), tref.get_handle()
- ).safe_then([this, FNAME, &tref](auto submit_result) mutable {
+ return journal->submit_record(
+ std::move(record),
+ tref.get_handle(),
+ tref.get_src(),
+ [this, FNAME, &tref](record_locator_t submit_result)
+ {
SUBDEBUGT(seastore_t, "committed with {}", tref, submit_result);
auto start_seq = submit_result.write_result.start_seq;
journal->get_trimmer().set_journal_head(start_seq);
@@ -471,10 +479,8 @@ TransactionManager::do_submit_transaction(
journal->get_trimmer().update_journal_tails(
cache->get_oldest_dirty_from().value_or(start_seq),
cache->get_oldest_backref_dirty_from().value_or(start_seq));
- return journal->finish_commit(tref.get_src()
- ).then([&tref] {
- return tref.get_handle().complete();
- });
+ }).safe_then([&tref] {
+ return tref.get_handle().complete();
}).handle_error(
submit_transaction_iertr::pass_further{},
crimson::ct_error::assert_all{"Hit error submitting to journal"}
@@ -506,7 +512,7 @@ TransactionManager::get_next_dirty_extents(
size_t max_bytes)
{
LOG_PREFIX(TransactionManager::get_next_dirty_extents);
- DEBUGT("max_bytes={}B, seq={}", t, max_bytes, seq);
+ DEBUGT("max_bytes=0x{:x}B, seq={}", t, max_bytes, seq);
return cache->get_next_dirty_extents(t, seq, max_bytes);
}
@@ -521,101 +527,111 @@ TransactionManager::rewrite_logical_extent(
ceph_abort();
}
- auto lextent = extent->cast<LogicalCachedExtent>();
- cache->retire_extent(t, extent);
- if (get_extent_category(lextent->get_type()) == data_category_t::METADATA) {
- auto nlextent = cache->alloc_new_extent_by_type(
+ if (get_extent_category(extent->get_type()) == data_category_t::METADATA) {
+ assert(extent->is_fully_loaded());
+ cache->retire_extent(t, extent);
+ auto nextent = cache->alloc_new_extent_by_type(
t,
- lextent->get_type(),
- lextent->get_length(),
- lextent->get_user_hint(),
+ extent->get_type(),
+ extent->get_length(),
+ extent->get_user_hint(),
// get target rewrite generation
- lextent->get_rewrite_generation())->cast<LogicalCachedExtent>();
- nlextent->rewrite(t, *lextent, 0);
+ extent->get_rewrite_generation())->cast<LogicalCachedExtent>();
+ nextent->rewrite(t, *extent, 0);
- DEBUGT("rewriting meta -- {} to {}", t, *lextent, *nlextent);
+ DEBUGT("rewriting meta -- {} to {}", t, *extent, *nextent);
#ifndef NDEBUG
- if (get_checksum_needed(lextent->get_paddr())) {
- assert(lextent->get_last_committed_crc() == lextent->calc_crc32c());
+ if (get_checksum_needed(extent->get_paddr())) {
+ assert(extent->get_last_committed_crc() == extent->calc_crc32c());
} else {
- assert(lextent->get_last_committed_crc() == CRC_NULL);
+ assert(extent->get_last_committed_crc() == CRC_NULL);
}
#endif
- nlextent->set_last_committed_crc(lextent->get_last_committed_crc());
+ nextent->set_last_committed_crc(extent->get_last_committed_crc());
/* This update_mapping is, strictly speaking, unnecessary for delayed_alloc
* extents since we're going to do it again once we either do the ool write
* or allocate a relative inline addr. TODO: refactor AsyncCleaner to
* avoid this complication. */
return lba_manager->update_mapping(
t,
- lextent->get_laddr(),
- lextent->get_length(),
- lextent->get_paddr(),
- nlextent->get_length(),
- nlextent->get_paddr(),
- nlextent->get_last_committed_crc(),
- nlextent.get()).discard_result();
+ extent->get_laddr(),
+ extent->get_length(),
+ extent->get_paddr(),
+ nextent->get_length(),
+ nextent->get_paddr(),
+ nextent->get_last_committed_crc(),
+ nextent.get()
+ ).discard_result();
} else {
- assert(get_extent_category(lextent->get_type()) == data_category_t::DATA);
- auto extents = cache->alloc_new_data_extents_by_type(
- t,
- lextent->get_type(),
- lextent->get_length(),
- lextent->get_user_hint(),
- // get target rewrite generation
- lextent->get_rewrite_generation());
- return seastar::do_with(
- std::move(extents),
- 0,
- lextent->get_length(),
- extent_ref_count_t(0),
- [this, FNAME, lextent, &t]
- (auto &extents, auto &off, auto &left, auto &refcount) {
- return trans_intr::do_for_each(
- extents,
- [lextent, this, FNAME, &t, &off, &left, &refcount](auto &nextent) {
- bool first_extent = (off == 0);
- ceph_assert(left >= nextent->get_length());
- auto nlextent = nextent->template cast<LogicalCachedExtent>();
- nlextent->rewrite(t, *lextent, off);
- DEBUGT("rewriting data -- {} to {}", t, *lextent, *nlextent);
-
- /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc
- * extents since we're going to do it again once we either do the ool write
- * or allocate a relative inline addr. TODO: refactor AsyncCleaner to
- * avoid this complication. */
- auto fut = base_iertr::now();
- if (first_extent) {
- fut = lba_manager->update_mapping(
- t,
- (lextent->get_laddr() + off).checked_to_laddr(),
- lextent->get_length(),
- lextent->get_paddr(),
- nlextent->get_length(),
- nlextent->get_paddr(),
- nlextent->get_last_committed_crc(),
- nlextent.get()
- ).si_then([&refcount](auto c) {
- refcount = c;
- });
- } else {
- ceph_assert(refcount != 0);
- fut = lba_manager->alloc_extent(
- t,
- (lextent->get_laddr() + off).checked_to_laddr(),
- *nlextent,
- refcount
- ).si_then([lextent, nlextent, off](auto mapping) {
- ceph_assert(mapping->get_key() == lextent->get_laddr() + off);
- ceph_assert(mapping->get_val() == nlextent->get_paddr());
+ assert(get_extent_category(extent->get_type()) == data_category_t::DATA);
+ auto length = extent->get_length();
+ return cache->read_extent_maybe_partial(
+ t, std::move(extent), 0, length
+ ).si_then([this, FNAME, &t](auto extent) {
+ assert(extent->is_fully_loaded());
+ cache->retire_extent(t, extent);
+ auto extents = cache->alloc_new_data_extents_by_type(
+ t,
+ extent->get_type(),
+ extent->get_length(),
+ extent->get_user_hint(),
+ // get target rewrite generation
+ extent->get_rewrite_generation());
+ return seastar::do_with(
+ std::move(extents),
+ 0,
+ extent->get_length(),
+ extent_ref_count_t(0),
+ [this, FNAME, extent, &t]
+ (auto &extents, auto &off, auto &left, auto &refcount)
+ {
+ return trans_intr::do_for_each(
+ extents,
+ [extent, this, FNAME, &t, &off, &left, &refcount](auto &_nextent)
+ {
+ auto nextent = _nextent->template cast<LogicalCachedExtent>();
+ bool first_extent = (off == 0);
+ ceph_assert(left >= nextent->get_length());
+ nextent->rewrite(t, *extent, off);
+ DEBUGT("rewriting data -- {} to {}", t, *extent, *nextent);
+
+ /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc
+ * extents since we're going to do it again once we either do the ool write
+ * or allocate a relative inline addr. TODO: refactor AsyncCleaner to
+ * avoid this complication. */
+ auto fut = base_iertr::now();
+ if (first_extent) {
+ fut = lba_manager->update_mapping(
+ t,
+ (extent->get_laddr() + off).checked_to_laddr(),
+ extent->get_length(),
+ extent->get_paddr(),
+ nextent->get_length(),
+ nextent->get_paddr(),
+ nextent->get_last_committed_crc(),
+ nextent.get()
+ ).si_then([&refcount](auto c) {
+ refcount = c;
+ });
+ } else {
+ ceph_assert(refcount != 0);
+ fut = lba_manager->alloc_extent(
+ t,
+ (extent->get_laddr() + off).checked_to_laddr(),
+ *nextent,
+ refcount
+ ).si_then([extent, nextent, off](auto mapping) {
+ ceph_assert(mapping->get_key() == extent->get_laddr() + off);
+ ceph_assert(mapping->get_val() == nextent->get_paddr());
+ return seastar::now();
+ });
+ }
+ return fut.si_then([&off, &left, nextent] {
+ off += nextent->get_length();
+ left -= nextent->get_length();
return seastar::now();
});
- }
- return fut.si_then([&off, &left, nlextent] {
- off += nlextent->get_length();
- left -= nlextent->get_length();
- return seastar::now();
});
});
});
@@ -714,7 +730,7 @@ TransactionManager::get_extents_if_live(
ceph_assert(paddr.get_addr_type() == paddr_types_t::SEGMENT);
return cache->get_extent_if_cached(t, paddr, type
- ).si_then([=, this, &t](auto extent)
+ ).si_then([this, FNAME, type, paddr, laddr, len, &t](auto extent)
-> get_extents_if_live_ret {
if (extent && extent->get_length() == len) {
DEBUGT("{} {}~0x{:x} {} is cached and alive -- {}",
@@ -731,19 +747,24 @@ TransactionManager::get_extents_if_live(
t,
laddr,
len
- ).si_then([=, this, &t](lba_pin_list_t pin_list) {
+ ).si_then([this, FNAME, type, paddr, laddr, len, &t](lba_pin_list_t pin_list) {
return seastar::do_with(
std::list<CachedExtentRef>(),
- [=, this, &t, pin_list=std::move(pin_list)](
- std::list<CachedExtentRef> &list) mutable
+ std::move(pin_list),
+ [this, FNAME, type, paddr, laddr, len, &t]
+ (std::list<CachedExtentRef> &extent_list, auto& pin_list)
{
auto paddr_seg_id = paddr.as_seg_paddr().get_segment_id();
return trans_intr::parallel_for_each(
pin_list,
- [=, this, &list, &t](
- LBAMappingRef &pin) -> Cache::get_extent_iertr::future<>
+ [this, FNAME, type, paddr_seg_id, &extent_list, &t](
+ LBAMappingRef& pin) -> Cache::get_extent_iertr::future<>
{
+ DEBUGT("got pin, try read in parallel ... -- {}", t, *pin);
auto pin_paddr = pin->get_val();
+ if (pin_paddr.get_addr_type() != paddr_types_t::SEGMENT) {
+ return seastar::now();
+ }
auto &pin_seg_paddr = pin_paddr.as_seg_paddr();
auto pin_paddr_seg_id = pin_seg_paddr.get_segment_id();
// auto pin_len = pin->get_length();
@@ -767,16 +788,16 @@ TransactionManager::get_extents_if_live(
// ceph_assert(pin_seg_paddr >= paddr &&
// pin_seg_paddr.add_offset(pin_len) <= paddr.add_offset(len));
return read_pin_by_type(t, std::move(pin), type
- ).si_then([&list](auto ret) {
- list.emplace_back(std::move(ret));
+ ).si_then([&extent_list](auto ret) {
+ extent_list.emplace_back(std::move(ret));
return seastar::now();
});
- }).si_then([&list, &t, FNAME, type, laddr, len, paddr] {
+ }).si_then([&extent_list, &t, FNAME, type, laddr, len, paddr] {
DEBUGT("{} {}~0x{:x} {} is alive as {} extents",
- t, type, laddr, len, paddr, list.size());
+ t, type, laddr, len, paddr, extent_list.size());
return get_extents_if_live_ret(
interruptible::ready_future_marker{},
- std::move(list));
+ std::move(extent_list));
});
});
}).handle_error_interruptible(crimson::ct_error::enoent::handle([] {
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
index c7a94a9ef11..e574460894a 100644
--- a/src/crimson/os/seastore/transaction_manager.h
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -23,6 +23,7 @@
#include "crimson/os/seastore/logging.h"
#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/root_meta.h"
#include "crimson/os/seastore/lba_manager.h"
#include "crimson/os/seastore/backref_manager.h"
#include "crimson/os/seastore/journal.h"
@@ -136,14 +137,66 @@ public:
}
/**
+ * maybe_indirect_extent_t
+ *
+ * Contains necessary information in case the extent is loaded from an
+ * indirect pin.
+ */
+ struct indirect_info_t {
+ extent_len_t intermediate_offset = 0;
+ extent_len_t length = 0;
+ };
+ template <typename T>
+ struct maybe_indirect_extent_t {
+ TCachedExtentRef<T> extent;
+ std::optional<indirect_info_t> maybe_indirect_info;
+ bool is_clone = false;
+
+ bool is_indirect() const {
+ return maybe_indirect_info.has_value();
+ }
+
+ ceph::bufferlist get_bl() const {
+ if (is_indirect()) {
+ return do_get_indirect_range(0, maybe_indirect_info->length);
+ } else {
+ assert(extent->is_fully_loaded());
+ bufferlist bl;
+ bl.append(extent->get_bptr());
+ return bl;
+ }
+ }
+
+ ceph::bufferlist get_range(
+ extent_len_t offset, extent_len_t length) const {
+ if (is_indirect()) {
+ return do_get_indirect_range(offset, length);
+ } else {
+ return extent->get_range(offset, length);
+ }
+ }
+ private:
+ ceph::bufferlist do_get_indirect_range(
+ extent_len_t offset, extent_len_t length) const {
+ assert(is_indirect());
+ assert(maybe_indirect_info->intermediate_offset + offset + length <=
+ extent->get_length());
+ assert(offset + length <= maybe_indirect_info->length);
+ return extent->get_range(
+ maybe_indirect_info->intermediate_offset + offset,
+ length);
+ }
+ };
+
+ /**
* read_extent
*
* Read extent of type T at offset~length
*/
using read_extent_iertr = get_pin_iertr;
template <typename T>
- using read_extent_ret = read_extent_iertr::future<
- TCachedExtentRef<T>>;
+ using read_extent_ret =
+ read_extent_iertr::future<maybe_indirect_extent_t<T>>;
template <typename T>
read_extent_ret<T> read_extent(
Transaction &t,
@@ -191,12 +244,30 @@ public:
}
template <typename T>
- base_iertr::future<TCachedExtentRef<T>> read_pin(
+ base_iertr::future<maybe_indirect_extent_t<T>> read_pin(
Transaction &t,
- LBAMappingRef pin)
+ LBAMappingRef pin,
+ extent_len_t partial_off,
+ extent_len_t partial_len)
{
+ static_assert(is_logical_type(T::TYPE));
+ assert(is_aligned(partial_off, get_block_size()));
+ assert(is_aligned(partial_len, get_block_size()));
+
+ extent_len_t direct_partial_off = partial_off;
+ bool is_clone = pin->is_clone();
+ std::optional<indirect_info_t> maybe_indirect_info;
+ if (pin->is_indirect()) {
+ auto intermediate_offset = pin->get_intermediate_offset();
+ direct_partial_off = intermediate_offset + partial_off;
+ maybe_indirect_info = indirect_info_t{
+ intermediate_offset, pin->get_length()};
+ }
+
LOG_PREFIX(TransactionManager::read_pin);
- SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin);
+ SUBDEBUGT(seastore_tm, "{} {} 0x{:x}~0x{:x} direct_off=0x{:x} ...",
+ t, T::TYPE, *pin, partial_off, partial_len, direct_partial_off);
+
auto fut = base_iertr::make_ready_future<LBAMappingRef>();
if (!pin->is_parent_viewable()) {
if (pin->is_parent_valid()) {
@@ -213,21 +284,42 @@ public:
pin->maybe_fix_pos();
fut = base_iertr::make_ready_future<LBAMappingRef>(std::move(pin));
}
- return fut.si_then([&t, this](auto npin) mutable {
+ return fut.si_then([&t, this, direct_partial_off, partial_len](auto npin) {
// checking the lba child must be atomic with creating
// and linking the absent child
auto ret = get_extent_if_linked<T>(t, std::move(npin));
if (ret.index() == 1) {
- return std::move(std::get<1>(ret));
+ return std::get<1>(ret
+ ).si_then([direct_partial_off, partial_len, this, &t](auto extent) {
+ return cache->read_extent_maybe_partial(
+ t, std::move(extent), direct_partial_off, partial_len);
+ });
} else {
- return this->pin_to_extent<T>(t, std::move(std::get<0>(ret)));
+ return this->pin_to_extent<T>(
+ t, std::move(std::get<0>(ret)), direct_partial_off, partial_len);
}
- }).si_then([FNAME, &t](TCachedExtentRef<T> ext) {
- SUBDEBUGT(seastore_tm, "got {}", t, *ext);
- return ext;
+ }).si_then([FNAME, maybe_indirect_info, is_clone, &t](TCachedExtentRef<T> ext) {
+ if (maybe_indirect_info.has_value()) {
+ SUBDEBUGT(seastore_tm, "got indirect +0x{:x}~0x{:x} is_clone={} {}",
+ t, maybe_indirect_info->intermediate_offset,
+ maybe_indirect_info->length, is_clone, *ext);
+ } else {
+ SUBDEBUGT(seastore_tm, "got direct is_clone={} {}",
+ t, is_clone, *ext);
+ }
+ return maybe_indirect_extent_t<T>{ext, maybe_indirect_info, is_clone};
});
}
+ template <typename T>
+ base_iertr::future<maybe_indirect_extent_t<T>> read_pin(
+ Transaction &t,
+ LBAMappingRef pin)
+ {
+ auto& pin_ref = *pin;
+ return read_pin<T>(t, std::move(pin), 0, pin_ref.get_length());
+ }
+
/// Obtain mutable copy of extent
LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) {
LOG_PREFIX(TransactionManager::get_mutable_extent);
@@ -303,10 +395,6 @@ public:
len,
placement_hint,
INIT_GENERATION);
- if (!ext) {
- SUBERRORT(seastore_tm, "insufficient space!", t);
- return crimson::ct_error::enospc::make();
- }
return lba_manager->alloc_extent(
t,
laddr_hint,
@@ -342,10 +430,6 @@ public:
len,
placement_hint,
INIT_GENERATION);
- if (exts.empty()) {
- SUBERRORT(seastore_tm, "insufficient space!", t);
- return crimson::ct_error::enospc::make();
- }
return lba_manager->alloc_extents(
t,
laddr_hint,
@@ -362,7 +446,8 @@ public:
}
template <typename T>
- read_extent_ret<T> get_mutable_extent_by_laddr(
+ get_pin_iertr::future<TCachedExtentRef<T>>
+ get_mutable_extent_by_laddr(
Transaction &t,
laddr_t laddr,
extent_len_t len) {
@@ -374,8 +459,11 @@ public:
ceph_assert(!pin->is_clone());
ceph_assert(pin->get_length() == len);
return this->read_pin<T>(t, std::move(pin));
- }).si_then([this, &t, FNAME](auto extent) {
- auto ext = get_mutable_extent(t, extent)->template cast<T>();
+ }).si_then([this, &t, FNAME](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ auto ext = get_mutable_extent(
+ t, maybe_indirect_extent.extent)->template cast<T>();
SUBDEBUGT(seastore_tm, "got mutable {}", t, *ext);
return read_extent_iertr::make_ready_future<TCachedExtentRef<T>>(
std::move(ext));
@@ -438,6 +526,7 @@ public:
// The according extent might be stable or pending.
auto fut = base_iertr::now();
if (!pin->is_indirect()) {
+ ceph_assert(!pin->is_clone());
if (!pin->is_parent_viewable()) {
if (pin->is_parent_valid()) {
pin = pin->refresh_with_pending_parent();
@@ -458,7 +547,12 @@ public:
fut = fut.si_then([this, &t, &pin] {
if (full_extent_integrity_check) {
- return read_pin<T>(t, pin->duplicate());
+ return read_pin<T>(t, pin->duplicate()
+ ).si_then([](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ return maybe_indirect_extent.extent;
+ });
} else {
auto ret = get_extent_if_linked<T>(t, pin->duplicate());
if (ret.index() == 1) {
@@ -475,6 +569,7 @@ public:
? (ext && ext->is_fully_loaded())
: true);
std::optional<ceph::bufferptr> original_bptr;
+ // TODO: preserve the bufferspace if partially loaded
if (ext && ext->is_fully_loaded()) {
ceph_assert(!ext->is_mutable());
ceph_assert(ext->get_length() >= original_len);
@@ -646,8 +741,9 @@ public:
TransactionRef create_transaction(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint = CACHE_HINT_TOUCH,
bool is_weak=false) final {
- return cache->create_transaction(src, name, is_weak);
+ return cache->create_transaction(src, name, cache_hint, is_weak);
}
using ExtentCallbackInterface::submit_transaction_direct_ret;
@@ -690,9 +786,14 @@ public:
const std::string &key) {
return cache->get_root(
t
- ).si_then([&key, &t](auto root) {
+ ).si_then([&t, this](auto root) {
+ return read_extent<RootMetaBlock>(t, root->root.meta);
+ }).si_then([key, &t](auto maybe_indirect_extent) {
LOG_PREFIX(TransactionManager::read_root_meta);
- auto meta = root->root.get_meta();
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ auto& mblock = maybe_indirect_extent.extent;
+ auto meta = mblock->get_meta();
auto iter = meta.find(key);
if (iter == meta.end()) {
SUBDEBUGT(seastore_tm, "{} -> nullopt", t, key);
@@ -701,7 +802,35 @@ public:
SUBDEBUGT(seastore_tm, "{} -> {}", t, key, iter->second);
return seastar::make_ready_future<read_root_meta_bare>(iter->second);
}
- });
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error!"}
+ );
+ }
+
+ /**
+ * init_root_meta
+ *
+ * create the root meta block
+ */
+ using init_root_meta_iertr = base_iertr;
+ using init_root_meta_ret = init_root_meta_iertr::future<>;
+ init_root_meta_ret init_root_meta(Transaction &t) {
+ return alloc_non_data_extent<RootMetaBlock>(
+ t, L_ADDR_MIN, RootMetaBlock::SIZE
+ ).si_then([this, &t](auto meta) {
+ meta->set_meta(RootMetaBlock::meta_t{});
+ return cache->get_root(t
+ ).si_then([this, &t, meta](auto root) {
+ auto mroot = cache->duplicate_for_write(
+ t, root)->template cast<RootBlock>();
+ mroot->root.meta = meta->get_laddr();
+ return seastar::now();
+ });
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error!"}
+ );
}
/**
@@ -719,15 +848,24 @@ public:
SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value);
return cache->get_root(
t
- ).si_then([this, &t, &key, &value](RootBlockRef root) {
- root = cache->duplicate_for_write(t, root)->cast<RootBlock>();
-
- auto meta = root->root.get_meta();
+ ).si_then([this, &t](RootBlockRef root) {
+ return read_extent<RootMetaBlock>(t, root->root.meta);
+ }).si_then([this, key, value, &t](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ auto& mblock = maybe_indirect_extent.extent;
+ mblock = get_mutable_extent(t, mblock
+ )->template cast<RootMetaBlock>();
+
+ auto meta = mblock->get_meta();
meta[key] = value;
- root->root.set_meta(meta);
+ mblock->set_meta(meta);
return seastar::now();
- });
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error!"}
+ );
}
/**
@@ -817,7 +955,7 @@ private:
shard_stats_t& shard_stats;
template <typename T>
- std::variant<LBAMappingRef, base_iertr::future<TCachedExtentRef<T>>>
+ std::variant<LBAMappingRef, get_child_ifut<T>>
get_extent_if_linked(
Transaction &t,
LBAMappingRef pin)
@@ -827,7 +965,8 @@ private:
// and linking the absent child
auto v = pin->get_logical_extent(t);
if (v.has_child()) {
- return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) {
+ return v.get_child_fut(
+ ).si_then([pin=std::move(pin)](auto extent) {
#ifndef NDEBUG
auto lextent = extent->template cast<LogicalCachedExtent>();
auto pin_laddr = pin->get_key();
@@ -849,11 +988,17 @@ private:
extent_types_t type)
{
ceph_assert(!pin->parent_modified());
+ assert(!pin->is_indirect());
+ // Note: pin might be a clone
auto v = pin->get_logical_extent(t);
// checking the lba child must be atomic with creating
// and linking the absent child
if (v.has_child()) {
- return std::move(v.get_child_fut());
+ return std::move(v.get_child_fut()
+ ).si_then([type](auto ext) {
+ ceph_assert(ext->get_type() == type);
+ return ext;
+ });
} else {
return pin_to_extent_by_type(t, std::move(pin), type);
}
@@ -877,6 +1022,7 @@ private:
* pin_to_extent
*
* Get extent mapped at pin.
+ * partially load buffer from direct_partial_off~partial_len if not present.
*/
using pin_to_extent_iertr = base_iertr;
template <typename T>
@@ -885,18 +1031,28 @@ private:
template <typename T>
pin_to_extent_ret<T> pin_to_extent(
Transaction &t,
- LBAMappingRef pin) {
- LOG_PREFIX(TransactionManager::pin_to_extent);
- SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin);
+ LBAMappingRef pin,
+ extent_len_t direct_partial_off,
+ extent_len_t partial_len) {
static_assert(is_logical_type(T::TYPE));
using ret = pin_to_extent_ret<T>;
auto &pref = *pin;
+ auto direct_length = pref.is_indirect() ?
+ pref.get_intermediate_length() :
+ pref.get_length();
+ if (full_extent_integrity_check) {
+ direct_partial_off = 0;
+ partial_len = direct_length;
+ }
+ LOG_PREFIX(TransactionManager::pin_to_extent);
+ SUBTRACET(seastore_tm, "getting absent extent from pin {}, 0x{:x}~0x{:x} ...",
+ t, *pin, direct_partial_off, partial_len);
return cache->get_absent_extent<T>(
t,
pref.get_val(),
- pref.is_indirect() ?
- pref.get_intermediate_length() :
- pref.get_length(),
+ direct_length,
+ direct_partial_off,
+ partial_len,
[&pref]
(T &extent) mutable {
assert(!extent.has_laddr());
@@ -907,30 +1063,33 @@ private:
extent.maybe_set_intermediate_laddr(pref);
}
).si_then([FNAME, &t, pin=std::move(pin), this](auto ref) mutable -> ret {
- auto crc = ref->calc_crc32c();
- SUBTRACET(
- seastore_tm,
- "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}",
- t,
- *ref,
- pin->get_checksum(),
- crc);
- assert(ref->is_fully_loaded());
- bool inconsistent = false;
- if (full_extent_integrity_check) {
- inconsistent = (pin->get_checksum() != crc);
- } else { // !full_extent_integrity_check: remapped extent may be skipped
- inconsistent = !(pin->get_checksum() == 0 ||
- pin->get_checksum() == crc);
- }
- if (unlikely(inconsistent)) {
- SUBERRORT(seastore_tm,
- "extent checksum inconsistent, recorded: {}, actual: {}, {}",
+ if (ref->is_fully_loaded()) {
+ auto crc = ref->calc_crc32c();
+ SUBTRACET(
+ seastore_tm,
+ "got extent -- {}, chksum in the lba tree: 0x{:x}, actual chksum: 0x{:x}",
t,
+ *ref,
pin->get_checksum(),
- crc,
- *ref);
- ceph_abort();
+ crc);
+ bool inconsistent = false;
+ if (full_extent_integrity_check) {
+ inconsistent = (pin->get_checksum() != crc);
+ } else { // !full_extent_integrity_check: remapped extent may be skipped
+ inconsistent = !(pin->get_checksum() == 0 ||
+ pin->get_checksum() == crc);
+ }
+ if (unlikely(inconsistent)) {
+ SUBERRORT(seastore_tm,
+ "extent checksum inconsistent, recorded: 0x{:x}, actual: 0x{:x}, {}",
+ t,
+ pin->get_checksum(),
+ crc,
+ *ref);
+ ceph_abort();
+ }
+ } else {
+ assert(!full_extent_integrity_check);
}
return pin_to_extent_ret<T>(
interruptible::ready_future_marker{},
@@ -955,14 +1114,21 @@ private:
t, *pin, type);
assert(is_logical_type(type));
auto &pref = *pin;
+ laddr_t direct_key;
+ extent_len_t direct_length;
+ if (pref.is_indirect()) {
+ direct_key = pref.get_intermediate_base();
+ direct_length = pref.get_intermediate_length();
+ } else {
+ direct_key = pref.get_key();
+ direct_length = pref.get_length();
+ }
return cache->get_absent_extent_by_type(
t,
type,
pref.get_val(),
- pref.get_key(),
- pref.is_indirect() ?
- pref.get_intermediate_length() :
- pref.get_length(),
+ direct_key,
+ direct_length,
[&pref](CachedExtent &extent) mutable {
auto &lextent = static_cast<LogicalCachedExtent&>(extent);
assert(!lextent.has_laddr());
@@ -977,7 +1143,7 @@ private:
auto crc = ref->calc_crc32c();
SUBTRACET(
seastore_tm,
- "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}",
+ "got extent -- {}, chksum in the lba tree: 0x{:x}, actual chksum: 0x{:x}",
t,
*ref,
pin->get_checksum(),
@@ -992,7 +1158,7 @@ private:
}
if (unlikely(inconsistent)) {
SUBERRORT(seastore_tm,
- "extent checksum inconsistent, recorded: {}, actual: {}, {}",
+ "extent checksum inconsistent, recorded: 0x{:x}, actual: 0x{:x}, {}",
t,
pin->get_checksum(),
crc,